{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.9508492952656304, "global_step": 22000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1391890048980713, "epoch": 0.0, "learning_rate": 1.9999982096052276e-06, "loss": 0.1776, "step": 10, "task_loss": 0.3535611927509308 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1461760699748993, "epoch": 0.01, "learning_rate": 1.999992838427322e-06, "loss": 0.1969, "step": 20, "task_loss": 0.3184128403663635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16645857691764832, "epoch": 0.01, "learning_rate": 1.9999838864855164e-06, "loss": 0.1698, "step": 30, "task_loss": 0.42190682888031006 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1625981330871582, "epoch": 0.01, "learning_rate": 1.999971353811865e-06, "loss": 0.1782, "step": 40, "task_loss": 0.23675988614559174 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14059175550937653, "epoch": 0.02, "learning_rate": 1.9999552404512455e-06, "loss": 0.1794, "step": 50, "task_loss": 0.3601588010787964 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19353535771369934, "epoch": 0.02, "learning_rate": 1.9999355464613565e-06, "loss": 0.1838, "step": 60, "task_loss": 0.5550195574760437 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.181913822889328, "epoch": 0.03, "learning_rate": 1.999912271912717e-06, "loss": 0.1859, "step": 70, "task_loss": 0.3225249648094177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13254797458648682, "epoch": 0.03, "learning_rate": 1.999885416888669e-06, "loss": 0.1691, "step": 80, "task_loss": 0.2899574637413025 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16020077466964722, "epoch": 0.03, "learning_rate": 1.999854981485375e-06, "loss": 0.1803, "step": 90, "task_loss": 0.43814536929130554 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16687241196632385, "epoch": 0.04, "learning_rate": 1.999820965811817e-06, "loss": 0.1859, "step": 100, "task_loss": 0.746590256690979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.155348539352417, "epoch": 0.04, "learning_rate": 1.9997833699897987e-06, "loss": 0.1793, "step": 110, "task_loss": 0.45200714468955994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1690862774848938, "epoch": 0.04, "learning_rate": 1.999742194153942e-06, "loss": 0.1799, "step": 120, "task_loss": 0.2519175112247467 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18965381383895874, "epoch": 0.05, "learning_rate": 1.99969743845169e-06, "loss": 0.1775, "step": 130, "task_loss": 0.35963237285614014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14351242780685425, "epoch": 0.05, "learning_rate": 1.9996491030433027e-06, "loss": 0.1842, "step": 140, "task_loss": 0.5129216313362122 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19938349723815918, "epoch": 0.05, "learning_rate": 1.999597188101859e-06, "loss": 0.1822, "step": 150, "task_loss": 0.6201828718185425 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15174296498298645, "epoch": 0.06, "learning_rate": 1.9995416938132554e-06, "loss": 0.1762, "step": 160, "task_loss": 0.2371373474597931 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14393183588981628, "epoch": 0.06, "learning_rate": 1.9994826203762056e-06, "loss": 0.1756, "step": 170, "task_loss": 0.3284216523170471 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15572452545166016, "epoch": 0.07, "learning_rate": 1.9994199680022386e-06, "loss": 0.1785, "step": 180, "task_loss": 0.17561133205890656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.160763680934906, "epoch": 0.07, "learning_rate": 1.9993537369157004e-06, "loss": 0.1814, "step": 190, "task_loss": 0.30648908019065857 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13948312401771545, "epoch": 0.07, "learning_rate": 1.9992839273537492e-06, "loss": 0.1719, "step": 200, "task_loss": 0.5067750215530396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19486072659492493, "epoch": 0.08, "learning_rate": 1.9992105395663598e-06, "loss": 0.1853, "step": 210, "task_loss": 0.4411253333091736 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14867699146270752, "epoch": 0.08, "learning_rate": 1.999133573816317e-06, "loss": 0.1812, "step": 220, "task_loss": 0.6160109639167786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14178313314914703, "epoch": 0.08, "learning_rate": 1.99905303037922e-06, "loss": 0.1844, "step": 230, "task_loss": 0.47401952743530273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15987926721572876, "epoch": 0.09, "learning_rate": 1.9989689095434775e-06, "loss": 0.174, "step": 240, "task_loss": 0.4866279065608978 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1667332649230957, "epoch": 0.09, "learning_rate": 1.9988812116103086e-06, "loss": 0.1783, "step": 250, "task_loss": 0.5632772445678711 }, { "epoch": 0.09, "eval_exact_match": 83.68968779564806, "eval_f1": 90.07662178846462, "step": 250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13907156884670258, "epoch": 0.09, "learning_rate": 1.998789936893741e-06, "loss": 0.1674, "step": 260, "task_loss": 0.25233495235443115 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12035252153873444, "epoch": 0.1, "learning_rate": 1.99869508572061e-06, "loss": 0.1682, "step": 270, "task_loss": 0.33309412002563477 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1731790155172348, "epoch": 0.1, "learning_rate": 1.9985966584305585e-06, "loss": 0.18, "step": 280, "task_loss": 0.38126981258392334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1732906997203827, "epoch": 0.1, "learning_rate": 1.9984946553760333e-06, "loss": 0.1723, "step": 290, "task_loss": 0.3908073902130127 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17171213030815125, "epoch": 0.11, "learning_rate": 1.998389076922286e-06, "loss": 0.1833, "step": 300, "task_loss": 0.3038800358772278 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17187830805778503, "epoch": 0.11, "learning_rate": 1.9982799234473707e-06, "loss": 0.1727, "step": 310, "task_loss": 0.5874561071395874 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14011260867118835, "epoch": 0.12, "learning_rate": 1.998167195342143e-06, "loss": 0.1728, "step": 320, "task_loss": 0.466874361038208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14882700145244598, "epoch": 0.12, "learning_rate": 1.998050893010259e-06, "loss": 0.1806, "step": 330, "task_loss": 0.5158421993255615 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2140088677406311, "epoch": 0.12, "learning_rate": 1.9979310168681726e-06, "loss": 0.1776, "step": 340, "task_loss": 0.49200907349586487 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1642945408821106, "epoch": 0.13, "learning_rate": 1.9978075673451348e-06, "loss": 0.1922, "step": 350, "task_loss": 0.2854197919368744 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12045970559120178, "epoch": 0.13, "learning_rate": 1.9976805448831925e-06, "loss": 0.1795, "step": 360, "task_loss": 0.3827168345451355 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16507384181022644, "epoch": 0.13, "learning_rate": 1.9975499499371862e-06, "loss": 0.173, "step": 370, "task_loss": 0.22587484121322632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15693482756614685, "epoch": 0.14, "learning_rate": 1.99741578297475e-06, "loss": 0.1795, "step": 380, "task_loss": 0.47314953804016113 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15382714569568634, "epoch": 0.14, "learning_rate": 1.9972780444763056e-06, "loss": 0.169, "step": 390, "task_loss": 0.46679824590682983 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1490176022052765, "epoch": 0.14, "learning_rate": 1.9971367349350676e-06, "loss": 0.169, "step": 400, "task_loss": 0.4132170081138611 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1326831877231598, "epoch": 0.15, "learning_rate": 1.9969918548570343e-06, "loss": 0.1712, "step": 410, "task_loss": 0.1556464433670044 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17481349408626556, "epoch": 0.15, "learning_rate": 1.9968434047609913e-06, "loss": 0.1751, "step": 420, "task_loss": 0.3467264771461487 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1614687293767929, "epoch": 0.16, "learning_rate": 1.9966913851785074e-06, "loss": 0.1698, "step": 430, "task_loss": 0.38117820024490356 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19468063116073608, "epoch": 0.16, "learning_rate": 1.996535796653933e-06, "loss": 0.1758, "step": 440, "task_loss": 0.30559083819389343 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18599817156791687, "epoch": 0.16, "learning_rate": 1.996376639744396e-06, "loss": 0.18, "step": 450, "task_loss": 0.5101648569107056 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14922016859054565, "epoch": 0.17, "learning_rate": 1.996213915019806e-06, "loss": 0.1767, "step": 460, "task_loss": 0.39836177229881287 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.133830264210701, "epoch": 0.17, "learning_rate": 1.9960476230628453e-06, "loss": 0.1811, "step": 470, "task_loss": 0.5501172542572021 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15386879444122314, "epoch": 0.17, "learning_rate": 1.9958777644689696e-06, "loss": 0.1752, "step": 480, "task_loss": 0.7182563543319702 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17507609724998474, "epoch": 0.18, "learning_rate": 1.995704339846408e-06, "loss": 0.1801, "step": 490, "task_loss": 0.40363389253616333 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17033545672893524, "epoch": 0.18, "learning_rate": 1.9955273498161563e-06, "loss": 0.1853, "step": 500, "task_loss": 0.5597988367080688 }, { "epoch": 0.18, "eval_exact_match": 83.69914853358561, "eval_f1": 90.06682101600445, "step": 500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16981837153434753, "epoch": 0.18, "learning_rate": 1.9953467950119794e-06, "loss": 0.1718, "step": 510, "task_loss": 0.34875768423080444 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12693539261817932, "epoch": 0.19, "learning_rate": 1.9951626760804064e-06, "loss": 0.1762, "step": 520, "task_loss": 0.3869848847389221 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14310693740844727, "epoch": 0.19, "learning_rate": 1.9949749936807275e-06, "loss": 0.1714, "step": 530, "task_loss": 0.3260875344276428 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1642155647277832, "epoch": 0.2, "learning_rate": 1.9947837484849944e-06, "loss": 0.1753, "step": 540, "task_loss": 0.25048545002937317 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16132649779319763, "epoch": 0.2, "learning_rate": 1.9945889411780158e-06, "loss": 0.1722, "step": 550, "task_loss": 0.34237614274024963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1687171906232834, "epoch": 0.2, "learning_rate": 1.9943905724573555e-06, "loss": 0.162, "step": 560, "task_loss": 0.3560226559638977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15636363625526428, "epoch": 0.21, "learning_rate": 1.99418864303333e-06, "loss": 0.1733, "step": 570, "task_loss": 0.559543788433075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16653823852539062, "epoch": 0.21, "learning_rate": 1.993983153629007e-06, "loss": 0.1774, "step": 580, "task_loss": 0.36561131477355957 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14254853129386902, "epoch": 0.21, "learning_rate": 1.9937741049802e-06, "loss": 0.1763, "step": 590, "task_loss": 0.3504565358161926 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14156261086463928, "epoch": 0.22, "learning_rate": 1.9935614978354687e-06, "loss": 0.171, "step": 600, "task_loss": 0.4731927812099457 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.24176684021949768, "epoch": 0.22, "learning_rate": 1.993345332956114e-06, "loss": 0.18, "step": 610, "task_loss": 0.44777655601501465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18350887298583984, "epoch": 0.22, "learning_rate": 1.9931256111161768e-06, "loss": 0.1735, "step": 620, "task_loss": 0.38023048639297485 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1785336434841156, "epoch": 0.23, "learning_rate": 1.9929023331024354e-06, "loss": 0.1838, "step": 630, "task_loss": 0.4373171329498291 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11950236558914185, "epoch": 0.23, "learning_rate": 1.992675499714401e-06, "loss": 0.1823, "step": 640, "task_loss": 0.4059186577796936 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1564953625202179, "epoch": 0.23, "learning_rate": 1.992445111764316e-06, "loss": 0.1766, "step": 650, "task_loss": 0.41068634390830994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17433582246303558, "epoch": 0.24, "learning_rate": 1.9922111700771514e-06, "loss": 0.1789, "step": 660, "task_loss": 0.3949933350086212 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16213908791542053, "epoch": 0.24, "learning_rate": 1.9919736754906037e-06, "loss": 0.1664, "step": 670, "task_loss": 0.4205024242401123 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19312430918216705, "epoch": 0.25, "learning_rate": 1.99173262885509e-06, "loss": 0.1738, "step": 680, "task_loss": 0.38273149728775024 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15186524391174316, "epoch": 0.25, "learning_rate": 1.991488031033748e-06, "loss": 0.1775, "step": 690, "task_loss": 0.43976613879203796 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16413924098014832, "epoch": 0.25, "learning_rate": 1.9912398829024316e-06, "loss": 0.1846, "step": 700, "task_loss": 0.4296082854270935 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1467546969652176, "epoch": 0.26, "learning_rate": 1.9909881853497063e-06, "loss": 0.1829, "step": 710, "task_loss": 0.2708396017551422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14315572381019592, "epoch": 0.26, "learning_rate": 1.990732939276848e-06, "loss": 0.1761, "step": 720, "task_loss": 0.28182071447372437 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20183299481868744, "epoch": 0.26, "learning_rate": 1.9904741455978396e-06, "loss": 0.1863, "step": 730, "task_loss": 1.0296525955200195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16263312101364136, "epoch": 0.27, "learning_rate": 1.990211805239367e-06, "loss": 0.1836, "step": 740, "task_loss": 0.7796942591667175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17141658067703247, "epoch": 0.27, "learning_rate": 1.989945919140815e-06, "loss": 0.1751, "step": 750, "task_loss": 0.40661606192588806 }, { "epoch": 0.27, "eval_exact_match": 83.57615894039735, "eval_f1": 89.9443629076221, "step": 750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20380395650863647, "epoch": 0.27, "learning_rate": 1.9896764882542666e-06, "loss": 0.173, "step": 760, "task_loss": 0.2610260844230652 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1519322693347931, "epoch": 0.28, "learning_rate": 1.9894035135444964e-06, "loss": 0.1762, "step": 770, "task_loss": 0.4701826870441437 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16315855085849762, "epoch": 0.28, "learning_rate": 1.9891269959889698e-06, "loss": 0.1768, "step": 780, "task_loss": 0.4588128626346588 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2139987349510193, "epoch": 0.29, "learning_rate": 1.988846936577838e-06, "loss": 0.1711, "step": 790, "task_loss": 0.509343147277832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14751572906970978, "epoch": 0.29, "learning_rate": 1.9885633363139344e-06, "loss": 0.1943, "step": 800, "task_loss": 0.571614146232605 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16162118315696716, "epoch": 0.29, "learning_rate": 1.9882761962127727e-06, "loss": 0.1629, "step": 810, "task_loss": 0.3844223618507385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1602107584476471, "epoch": 0.3, "learning_rate": 1.9879855173025404e-06, "loss": 0.1685, "step": 820, "task_loss": 0.3083072602748871 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14762470126152039, "epoch": 0.3, "learning_rate": 1.9876913006240975e-06, "loss": 0.1686, "step": 830, "task_loss": 0.421251118183136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1615067422389984, "epoch": 0.3, "learning_rate": 1.9873935472309726e-06, "loss": 0.1734, "step": 840, "task_loss": 0.3794122338294983 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14823423326015472, "epoch": 0.31, "learning_rate": 1.9870922581893573e-06, "loss": 0.1748, "step": 850, "task_loss": 0.3405918478965759 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1978568136692047, "epoch": 0.31, "learning_rate": 1.9867874345781048e-06, "loss": 0.1775, "step": 860, "task_loss": 0.5284197330474854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22010980546474457, "epoch": 0.31, "learning_rate": 1.9864790774887234e-06, "loss": 0.1765, "step": 870, "task_loss": 0.35552144050598145 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17657122015953064, "epoch": 0.32, "learning_rate": 1.986167188025376e-06, "loss": 0.1685, "step": 880, "task_loss": 0.3586549460887909 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15283505618572235, "epoch": 0.32, "learning_rate": 1.985851767304873e-06, "loss": 0.1852, "step": 890, "task_loss": 0.48049455881118774 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17572566866874695, "epoch": 0.33, "learning_rate": 1.985532816456669e-06, "loss": 0.1661, "step": 900, "task_loss": 0.4304676055908203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12883397936820984, "epoch": 0.33, "learning_rate": 1.98521033662286e-06, "loss": 0.1767, "step": 910, "task_loss": 0.37426260113716125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14257290959358215, "epoch": 0.33, "learning_rate": 1.984884328958179e-06, "loss": 0.191, "step": 920, "task_loss": 0.6537871360778809 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14154499769210815, "epoch": 0.34, "learning_rate": 1.9845547946299902e-06, "loss": 0.1865, "step": 930, "task_loss": 0.5936552286148071 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1575993001461029, "epoch": 0.34, "learning_rate": 1.984221734818287e-06, "loss": 0.169, "step": 940, "task_loss": 0.42563629150390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15309542417526245, "epoch": 0.34, "learning_rate": 1.9838851507156864e-06, "loss": 0.1771, "step": 950, "task_loss": 0.5308173894882202 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1929389238357544, "epoch": 0.35, "learning_rate": 1.983545043527425e-06, "loss": 0.1703, "step": 960, "task_loss": 0.5199047923088074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15166574716567993, "epoch": 0.35, "learning_rate": 1.9832014144713554e-06, "loss": 0.1738, "step": 970, "task_loss": 0.3604187071323395 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14696621894836426, "epoch": 0.35, "learning_rate": 1.9828542647779415e-06, "loss": 0.1776, "step": 980, "task_loss": 0.3480679392814636 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17910030484199524, "epoch": 0.36, "learning_rate": 1.9825035956902515e-06, "loss": 0.184, "step": 990, "task_loss": 0.3781590461730957 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18165017664432526, "epoch": 0.36, "learning_rate": 1.9821494084639595e-06, "loss": 0.1759, "step": 1000, "task_loss": 0.5116103887557983 }, { "epoch": 0.36, "eval_exact_match": 83.45316934720908, "eval_f1": 89.78098563856513, "step": 1000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17691287398338318, "epoch": 0.37, "learning_rate": 1.9817917043673343e-06, "loss": 0.1735, "step": 1010, "task_loss": 0.5271925330162048 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17111527919769287, "epoch": 0.37, "learning_rate": 1.9814304846812396e-06, "loss": 0.1766, "step": 1020, "task_loss": 0.28790348768234253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1404891461133957, "epoch": 0.37, "learning_rate": 1.981065750699127e-06, "loss": 0.1743, "step": 1030, "task_loss": 0.565461277961731 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.121620774269104, "epoch": 0.38, "learning_rate": 1.980697503727031e-06, "loss": 0.1746, "step": 1040, "task_loss": 0.3549140691757202 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13501675426959991, "epoch": 0.38, "learning_rate": 1.9803257450835683e-06, "loss": 0.172, "step": 1050, "task_loss": 0.6684577465057373 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.137796550989151, "epoch": 0.38, "learning_rate": 1.9799504760999275e-06, "loss": 0.1804, "step": 1060, "task_loss": 0.3713378310203552 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14667566120624542, "epoch": 0.39, "learning_rate": 1.9795716981198676e-06, "loss": 0.1728, "step": 1070, "task_loss": 0.35704660415649414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21063314378261566, "epoch": 0.39, "learning_rate": 1.979189412499713e-06, "loss": 0.194, "step": 1080, "task_loss": 0.5208743810653687 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13344134390354156, "epoch": 0.39, "learning_rate": 1.9788036206083484e-06, "loss": 0.1723, "step": 1090, "task_loss": 0.27669817209243774 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17827028036117554, "epoch": 0.4, "learning_rate": 1.9784143238272128e-06, "loss": 0.1615, "step": 1100, "task_loss": 0.3210203945636749 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16757118701934814, "epoch": 0.4, "learning_rate": 1.9780215235502968e-06, "loss": 0.1686, "step": 1110, "task_loss": 0.43170055747032166 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15356716513633728, "epoch": 0.4, "learning_rate": 1.9776252211841346e-06, "loss": 0.1724, "step": 1120, "task_loss": 0.3005247116088867 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16435839235782623, "epoch": 0.41, "learning_rate": 1.977225418147802e-06, "loss": 0.1757, "step": 1130, "task_loss": 0.28998297452926636 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.09947463124990463, "epoch": 0.41, "learning_rate": 1.97682211587291e-06, "loss": 0.1595, "step": 1140, "task_loss": 0.36723411083221436 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18333488702774048, "epoch": 0.42, "learning_rate": 1.976415315803599e-06, "loss": 0.1878, "step": 1150, "task_loss": 0.42832082509994507 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13132742047309875, "epoch": 0.42, "learning_rate": 1.9760050193965333e-06, "loss": 0.174, "step": 1160, "task_loss": 0.3157771825790405 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.155666783452034, "epoch": 0.42, "learning_rate": 1.9755912281208997e-06, "loss": 0.1633, "step": 1170, "task_loss": 0.30877023935317993 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2435683161020279, "epoch": 0.43, "learning_rate": 1.9751739434583966e-06, "loss": 0.1782, "step": 1180, "task_loss": 0.35170674324035645 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1864657998085022, "epoch": 0.43, "learning_rate": 1.9747531669032326e-06, "loss": 0.1886, "step": 1190, "task_loss": 0.504147469997406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14224031567573547, "epoch": 0.43, "learning_rate": 1.97432889996212e-06, "loss": 0.1674, "step": 1200, "task_loss": 0.47088128328323364 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13773639500141144, "epoch": 0.44, "learning_rate": 1.9739011441542703e-06, "loss": 0.1722, "step": 1210, "task_loss": 0.41278308629989624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18634013831615448, "epoch": 0.44, "learning_rate": 1.973469901011386e-06, "loss": 0.1812, "step": 1220, "task_loss": 0.6850751042366028 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17051789164543152, "epoch": 0.44, "learning_rate": 1.973035172077658e-06, "loss": 0.1677, "step": 1230, "task_loss": 0.41830068826675415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1463625729084015, "epoch": 0.45, "learning_rate": 1.97259695890976e-06, "loss": 0.1775, "step": 1240, "task_loss": 0.5584670305252075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1390727460384369, "epoch": 0.45, "learning_rate": 1.9721552630768407e-06, "loss": 0.1766, "step": 1250, "task_loss": 0.5843634605407715 }, { "epoch": 0.45, "eval_exact_match": 83.57615894039735, "eval_f1": 89.93850085654329, "step": 1250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14995011687278748, "epoch": 0.46, "learning_rate": 1.9717100861605196e-06, "loss": 0.1696, "step": 1260, "task_loss": 0.7748905420303345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10256899893283844, "epoch": 0.46, "learning_rate": 1.971261429754882e-06, "loss": 0.1758, "step": 1270, "task_loss": 0.14379727840423584 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13538025319576263, "epoch": 0.46, "learning_rate": 1.970809295466472e-06, "loss": 0.1922, "step": 1280, "task_loss": 0.4599601924419403 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1672298014163971, "epoch": 0.47, "learning_rate": 1.9703536849142864e-06, "loss": 0.1801, "step": 1290, "task_loss": 0.364857017993927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13718703389167786, "epoch": 0.47, "learning_rate": 1.9698945997297722e-06, "loss": 0.1898, "step": 1300, "task_loss": 0.32751551270484924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15767154097557068, "epoch": 0.47, "learning_rate": 1.969432041556816e-06, "loss": 0.191, "step": 1310, "task_loss": 0.5771661400794983 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16633427143096924, "epoch": 0.48, "learning_rate": 1.968966012051741e-06, "loss": 0.1904, "step": 1320, "task_loss": 0.20087338984012604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18319915235042572, "epoch": 0.48, "learning_rate": 1.9684965128833016e-06, "loss": 0.1653, "step": 1330, "task_loss": 0.3794475197792053 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14207670092582703, "epoch": 0.48, "learning_rate": 1.968023545732675e-06, "loss": 0.1643, "step": 1340, "task_loss": 0.44781196117401123 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18061572313308716, "epoch": 0.49, "learning_rate": 1.967547112293457e-06, "loss": 0.1737, "step": 1350, "task_loss": 0.39925694465637207 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12682190537452698, "epoch": 0.49, "learning_rate": 1.967067214271656e-06, "loss": 0.1698, "step": 1360, "task_loss": 0.3139961063861847 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17150625586509705, "epoch": 0.5, "learning_rate": 1.966583853385685e-06, "loss": 0.1768, "step": 1370, "task_loss": 0.6008814573287964 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1599908322095871, "epoch": 0.5, "learning_rate": 1.9660970313663583e-06, "loss": 0.171, "step": 1380, "task_loss": 0.4636422395706177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14392361044883728, "epoch": 0.5, "learning_rate": 1.9656067499568826e-06, "loss": 0.1899, "step": 1390, "task_loss": 0.3466000258922577 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15924058854579926, "epoch": 0.51, "learning_rate": 1.965113010912853e-06, "loss": 0.1803, "step": 1400, "task_loss": 0.5084146857261658 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16537730395793915, "epoch": 0.51, "learning_rate": 1.964615816002244e-06, "loss": 0.1729, "step": 1410, "task_loss": 0.5999622344970703 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16629469394683838, "epoch": 0.51, "learning_rate": 1.9641151670054075e-06, "loss": 0.1845, "step": 1420, "task_loss": 0.32429054379463196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16834649443626404, "epoch": 0.52, "learning_rate": 1.963611065715061e-06, "loss": 0.1705, "step": 1430, "task_loss": 0.21305644512176514 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1554267406463623, "epoch": 0.52, "learning_rate": 1.963103513936286e-06, "loss": 0.1904, "step": 1440, "task_loss": 0.22570659220218658 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20235656201839447, "epoch": 0.52, "learning_rate": 1.9625925134865174e-06, "loss": 0.1898, "step": 1450, "task_loss": 0.3236815333366394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15887069702148438, "epoch": 0.53, "learning_rate": 1.9620780661955414e-06, "loss": 0.1798, "step": 1460, "task_loss": 0.6010942459106445 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15550808608531952, "epoch": 0.53, "learning_rate": 1.961560173905485e-06, "loss": 0.1817, "step": 1470, "task_loss": 0.18300172686576843 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1984330117702484, "epoch": 0.53, "learning_rate": 1.961038838470812e-06, "loss": 0.1697, "step": 1480, "task_loss": 0.5193182826042175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15731069445610046, "epoch": 0.54, "learning_rate": 1.9605140617583136e-06, "loss": 0.1775, "step": 1490, "task_loss": 0.6436678171157837 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18328088521957397, "epoch": 0.54, "learning_rate": 1.959985845647106e-06, "loss": 0.1778, "step": 1500, "task_loss": 0.4055144190788269 }, { "epoch": 0.54, "eval_exact_match": 83.79375591296122, "eval_f1": 90.07183397891889, "step": 1500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.28984278440475464, "epoch": 0.55, "learning_rate": 1.95945419202862e-06, "loss": 0.1925, "step": 1510, "task_loss": 0.5853183269500732 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15279538929462433, "epoch": 0.55, "learning_rate": 1.9589191028065944e-06, "loss": 0.1724, "step": 1520, "task_loss": 0.1796664148569107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1945275217294693, "epoch": 0.55, "learning_rate": 1.958380579897072e-06, "loss": 0.1913, "step": 1530, "task_loss": 1.093052625656128 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19740872085094452, "epoch": 0.56, "learning_rate": 1.9578386252283893e-06, "loss": 0.1837, "step": 1540, "task_loss": 0.2792867422103882 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16871120035648346, "epoch": 0.56, "learning_rate": 1.9572932407411715e-06, "loss": 0.1797, "step": 1550, "task_loss": 0.3847885727882385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15976202487945557, "epoch": 0.56, "learning_rate": 1.9567444283883274e-06, "loss": 0.1712, "step": 1560, "task_loss": 0.4087778925895691 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17347773909568787, "epoch": 0.57, "learning_rate": 1.956192190135037e-06, "loss": 0.1796, "step": 1570, "task_loss": 0.6276683211326599 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1492072194814682, "epoch": 0.57, "learning_rate": 1.95563652795875e-06, "loss": 0.188, "step": 1580, "task_loss": 0.406479150056839 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15427610278129578, "epoch": 0.57, "learning_rate": 1.955077443849175e-06, "loss": 0.1677, "step": 1590, "task_loss": 0.21835781633853912 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17291224002838135, "epoch": 0.58, "learning_rate": 1.954514939808275e-06, "loss": 0.1772, "step": 1600, "task_loss": 0.5329791903495789 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17131741344928741, "epoch": 0.58, "learning_rate": 1.9539490178502587e-06, "loss": 0.1668, "step": 1610, "task_loss": 0.42593175172805786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1412690430879593, "epoch": 0.59, "learning_rate": 1.9533796800015736e-06, "loss": 0.188, "step": 1620, "task_loss": 0.4295274317264557 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1931207925081253, "epoch": 0.59, "learning_rate": 1.952806928300898e-06, "loss": 0.1789, "step": 1630, "task_loss": 0.33001917600631714 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15779228508472443, "epoch": 0.59, "learning_rate": 1.9522307647991365e-06, "loss": 0.1735, "step": 1640, "task_loss": 0.35899198055267334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17678138613700867, "epoch": 0.6, "learning_rate": 1.951651191559408e-06, "loss": 0.173, "step": 1650, "task_loss": 0.30126041173934937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14263349771499634, "epoch": 0.6, "learning_rate": 1.951068210657043e-06, "loss": 0.1911, "step": 1660, "task_loss": 0.3900689482688904 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14301198720932007, "epoch": 0.6, "learning_rate": 1.9504818241795735e-06, "loss": 0.1635, "step": 1670, "task_loss": 0.27476924657821655 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1587420105934143, "epoch": 0.61, "learning_rate": 1.9498920342267256e-06, "loss": 0.177, "step": 1680, "task_loss": 0.5345589518547058 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18046513199806213, "epoch": 0.61, "learning_rate": 1.949298842910413e-06, "loss": 0.1811, "step": 1690, "task_loss": 0.5509105324745178 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11487637460231781, "epoch": 0.61, "learning_rate": 1.9487022523547296e-06, "loss": 0.1728, "step": 1700, "task_loss": 0.3565746545791626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1578724980354309, "epoch": 0.62, "learning_rate": 1.9481022646959403e-06, "loss": 0.1859, "step": 1710, "task_loss": 0.42493292689323425 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2012399435043335, "epoch": 0.62, "learning_rate": 1.9474988820824743e-06, "loss": 0.1704, "step": 1720, "task_loss": 0.4968900978565216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1466311812400818, "epoch": 0.63, "learning_rate": 1.946892106674918e-06, "loss": 0.1922, "step": 1730, "task_loss": 0.3440721035003662 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18941572308540344, "epoch": 0.63, "learning_rate": 1.9462819406460066e-06, "loss": 0.1898, "step": 1740, "task_loss": 0.7371132373809814 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14826852083206177, "epoch": 0.63, "learning_rate": 1.945668386180616e-06, "loss": 0.1803, "step": 1750, "task_loss": 0.34923413395881653 }, { "epoch": 0.63, "eval_exact_match": 83.75591296121098, "eval_f1": 90.08127134672708, "step": 1750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12791767716407776, "epoch": 0.64, "learning_rate": 1.9450514454757557e-06, "loss": 0.1798, "step": 1760, "task_loss": 0.26741665601730347 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20207276940345764, "epoch": 0.64, "learning_rate": 1.9444311207405607e-06, "loss": 0.1768, "step": 1770, "task_loss": 0.4358224868774414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14697374403476715, "epoch": 0.64, "learning_rate": 1.943807414196283e-06, "loss": 0.1722, "step": 1780, "task_loss": 0.4396517276763916 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14546442031860352, "epoch": 0.65, "learning_rate": 1.9431803280762847e-06, "loss": 0.1789, "step": 1790, "task_loss": 0.4033772945404053 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18037843704223633, "epoch": 0.65, "learning_rate": 1.942549864626029e-06, "loss": 0.1731, "step": 1800, "task_loss": 0.17874035239219666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1852273941040039, "epoch": 0.65, "learning_rate": 1.9419160261030732e-06, "loss": 0.1759, "step": 1810, "task_loss": 0.7803164720535278 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18808463215827942, "epoch": 0.66, "learning_rate": 1.941278814777059e-06, "loss": 0.1729, "step": 1820, "task_loss": 0.2709602415561676 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15545278787612915, "epoch": 0.66, "learning_rate": 1.940638232929707e-06, "loss": 0.1833, "step": 1830, "task_loss": 0.5003842115402222 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1838669627904892, "epoch": 0.66, "learning_rate": 1.939994282854805e-06, "loss": 0.1838, "step": 1840, "task_loss": 0.5018002986907959 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1540856808423996, "epoch": 0.67, "learning_rate": 1.9393469668582037e-06, "loss": 0.188, "step": 1850, "task_loss": 0.4272739291191101 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18242205679416656, "epoch": 0.67, "learning_rate": 1.9386962872578046e-06, "loss": 0.1845, "step": 1860, "task_loss": 0.6574127674102783 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16230648756027222, "epoch": 0.68, "learning_rate": 1.938042246383555e-06, "loss": 0.179, "step": 1870, "task_loss": 0.3625655174255371 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16737070679664612, "epoch": 0.68, "learning_rate": 1.9373848465774373e-06, "loss": 0.1707, "step": 1880, "task_loss": 0.3662741184234619 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14946278929710388, "epoch": 0.68, "learning_rate": 1.936724090193462e-06, "loss": 0.1815, "step": 1890, "task_loss": 0.4266752600669861 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14057648181915283, "epoch": 0.69, "learning_rate": 1.936059979597658e-06, "loss": 0.1713, "step": 1900, "task_loss": 0.19198694825172424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13985538482666016, "epoch": 0.69, "learning_rate": 1.9353925171680666e-06, "loss": 0.1662, "step": 1910, "task_loss": 0.3392411172389984 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16503742337226868, "epoch": 0.69, "learning_rate": 1.93472170529473e-06, "loss": 0.1854, "step": 1920, "task_loss": 0.4105537533760071 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16417258977890015, "epoch": 0.7, "learning_rate": 1.9340475463796833e-06, "loss": 0.1803, "step": 1930, "task_loss": 0.3279024660587311 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1898116171360016, "epoch": 0.7, "learning_rate": 1.9333700428369494e-06, "loss": 0.1849, "step": 1940, "task_loss": 0.6425855755805969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15912674367427826, "epoch": 0.7, "learning_rate": 1.9326891970925246e-06, "loss": 0.1801, "step": 1950, "task_loss": 0.3348599970340729 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22277340292930603, "epoch": 0.71, "learning_rate": 1.9320050115843748e-06, "loss": 0.1904, "step": 1960, "task_loss": 0.525826096534729 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19547489285469055, "epoch": 0.71, "learning_rate": 1.9313174887624245e-06, "loss": 0.1759, "step": 1970, "task_loss": 0.47217780351638794 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16027839481830597, "epoch": 0.72, "learning_rate": 1.930626631088548e-06, "loss": 0.1846, "step": 1980, "task_loss": 0.3707219362258911 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2344164252281189, "epoch": 0.72, "learning_rate": 1.9299324410365607e-06, "loss": 0.1822, "step": 1990, "task_loss": 0.44019412994384766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1779266893863678, "epoch": 0.72, "learning_rate": 1.9292349210922114e-06, "loss": 0.1815, "step": 2000, "task_loss": 0.2988145351409912 }, { "epoch": 0.72, "eval_exact_match": 83.75591296121098, "eval_f1": 90.10693629002859, "step": 2000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19847017526626587, "epoch": 0.73, "learning_rate": 1.928534073753173e-06, "loss": 0.1785, "step": 2010, "task_loss": 0.42653709650039673 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15378426015377045, "epoch": 0.73, "learning_rate": 1.9278299015290313e-06, "loss": 0.1681, "step": 2020, "task_loss": 0.3308219909667969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16970130801200867, "epoch": 0.73, "learning_rate": 1.9271224069412792e-06, "loss": 0.1797, "step": 2030, "task_loss": 0.5738540887832642 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1360698640346527, "epoch": 0.74, "learning_rate": 1.9264115925233063e-06, "loss": 0.1678, "step": 2040, "task_loss": 0.2731480300426483 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14180555939674377, "epoch": 0.74, "learning_rate": 1.925697460820389e-06, "loss": 0.1714, "step": 2050, "task_loss": 0.4451160132884979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16470122337341309, "epoch": 0.74, "learning_rate": 1.9249800143896825e-06, "loss": 0.1865, "step": 2060, "task_loss": 0.9461302161216736 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13804659247398376, "epoch": 0.75, "learning_rate": 1.9242592558002116e-06, "loss": 0.1804, "step": 2070, "task_loss": 0.5417971611022949 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20538190007209778, "epoch": 0.75, "learning_rate": 1.9235351876328612e-06, "loss": 0.1716, "step": 2080, "task_loss": 0.468280553817749 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17718102037906647, "epoch": 0.76, "learning_rate": 1.9228078124803676e-06, "loss": 0.1694, "step": 2090, "task_loss": 0.4932321608066559 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1479235589504242, "epoch": 0.76, "learning_rate": 1.922077132947307e-06, "loss": 0.1831, "step": 2100, "task_loss": 0.3168169856071472 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22004127502441406, "epoch": 0.76, "learning_rate": 1.9213431516500902e-06, "loss": 0.1788, "step": 2110, "task_loss": 0.5075056552886963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16012755036354065, "epoch": 0.77, "learning_rate": 1.920605871216949e-06, "loss": 0.1779, "step": 2120, "task_loss": 0.4837586581707001 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15686213970184326, "epoch": 0.77, "learning_rate": 1.919865294287929e-06, "loss": 0.1832, "step": 2130, "task_loss": 0.3587515950202942 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15825381875038147, "epoch": 0.77, "learning_rate": 1.919121423514882e-06, "loss": 0.1663, "step": 2140, "task_loss": 0.27808576822280884 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16749948263168335, "epoch": 0.78, "learning_rate": 1.918374261561451e-06, "loss": 0.1727, "step": 2150, "task_loss": 0.3993534445762634 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14597871899604797, "epoch": 0.78, "learning_rate": 1.9176238111030663e-06, "loss": 0.1674, "step": 2160, "task_loss": 0.2307223528623581 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15081357955932617, "epoch": 0.78, "learning_rate": 1.9168700748269336e-06, "loss": 0.1965, "step": 2170, "task_loss": 0.2585129141807556 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16626910865306854, "epoch": 0.79, "learning_rate": 1.916113055432023e-06, "loss": 0.1749, "step": 2180, "task_loss": 0.3819560408592224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.161339670419693, "epoch": 0.79, "learning_rate": 1.915352755629062e-06, "loss": 0.1702, "step": 2190, "task_loss": 0.4928886592388153 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15861815214157104, "epoch": 0.8, "learning_rate": 1.9145891781405242e-06, "loss": 0.1737, "step": 2200, "task_loss": 0.4635201096534729 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16528362035751343, "epoch": 0.8, "learning_rate": 1.91382232570062e-06, "loss": 0.1802, "step": 2210, "task_loss": 0.5430650115013123 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13787183165550232, "epoch": 0.8, "learning_rate": 1.9130522010552868e-06, "loss": 0.1701, "step": 2220, "task_loss": 0.24549484252929688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13275370001792908, "epoch": 0.81, "learning_rate": 1.9122788069621785e-06, "loss": 0.1586, "step": 2230, "task_loss": 0.3313102722167969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21796873211860657, "epoch": 0.81, "learning_rate": 1.9115021461906563e-06, "loss": 0.1696, "step": 2240, "task_loss": 0.6265380382537842 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1380050778388977, "epoch": 0.81, "learning_rate": 1.9107222215217797e-06, "loss": 0.1662, "step": 2250, "task_loss": 0.4539833068847656 }, { "epoch": 0.81, "eval_exact_match": 83.8221381267739, "eval_f1": 90.09116234913313, "step": 2250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1531752347946167, "epoch": 0.82, "learning_rate": 1.9099390357482943e-06, "loss": 0.1697, "step": 2260, "task_loss": 0.1812114119529724 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19936254620552063, "epoch": 0.82, "learning_rate": 1.9091525916746236e-06, "loss": 0.1869, "step": 2270, "task_loss": 0.382361501455307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18323323130607605, "epoch": 0.82, "learning_rate": 1.9083628921168582e-06, "loss": 0.174, "step": 2280, "task_loss": 0.30770325660705566 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1802387237548828, "epoch": 0.83, "learning_rate": 1.9075699399027466e-06, "loss": 0.1861, "step": 2290, "task_loss": 0.521061897277832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18234525620937347, "epoch": 0.83, "learning_rate": 1.9067737378716833e-06, "loss": 0.2008, "step": 2300, "task_loss": 0.2811727225780487 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16019967198371887, "epoch": 0.83, "learning_rate": 1.9059742888747002e-06, "loss": 0.1948, "step": 2310, "task_loss": 0.4444176256656647 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15703542530536652, "epoch": 0.84, "learning_rate": 1.9051715957744562e-06, "loss": 0.1685, "step": 2320, "task_loss": 0.3662908673286438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14924360811710358, "epoch": 0.84, "learning_rate": 1.9043656614452257e-06, "loss": 0.18, "step": 2330, "task_loss": 0.32921046018600464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15606483817100525, "epoch": 0.85, "learning_rate": 1.9035564887728907e-06, "loss": 0.1725, "step": 2340, "task_loss": 0.5169017910957336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16825850307941437, "epoch": 0.85, "learning_rate": 1.902744080654928e-06, "loss": 0.1805, "step": 2350, "task_loss": 0.493346244096756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22923272848129272, "epoch": 0.85, "learning_rate": 1.9019284400003998e-06, "loss": 0.1885, "step": 2360, "task_loss": 0.4266737997531891 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18409624695777893, "epoch": 0.86, "learning_rate": 1.901109569729944e-06, "loss": 0.1706, "step": 2370, "task_loss": 0.2116602510213852 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1556832194328308, "epoch": 0.86, "learning_rate": 1.9002874727757627e-06, "loss": 0.1654, "step": 2380, "task_loss": 0.324906587600708 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21795554459095, "epoch": 0.86, "learning_rate": 1.899462152081612e-06, "loss": 0.1917, "step": 2390, "task_loss": 0.5224672555923462 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1452036201953888, "epoch": 0.87, "learning_rate": 1.898633610602791e-06, "loss": 0.1783, "step": 2400, "task_loss": 0.29410141706466675 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14028596878051758, "epoch": 0.87, "learning_rate": 1.8978018513061333e-06, "loss": 0.1796, "step": 2410, "task_loss": 0.22278541326522827 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13686451315879822, "epoch": 0.87, "learning_rate": 1.8969668771699936e-06, "loss": 0.1592, "step": 2420, "task_loss": 0.2901807129383087 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17762663960456848, "epoch": 0.88, "learning_rate": 1.8961286911842385e-06, "loss": 0.18, "step": 2430, "task_loss": 0.6580682992935181 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15839552879333496, "epoch": 0.88, "learning_rate": 1.8952872963502354e-06, "loss": 0.1748, "step": 2440, "task_loss": 0.33616119623184204 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1614687591791153, "epoch": 0.89, "learning_rate": 1.8944426956808423e-06, "loss": 0.1668, "step": 2450, "task_loss": 0.3434739112854004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1861266791820526, "epoch": 0.89, "learning_rate": 1.8935948922003964e-06, "loss": 0.1747, "step": 2460, "task_loss": 0.5159826278686523 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19838041067123413, "epoch": 0.89, "learning_rate": 1.8927438889447037e-06, "loss": 0.1775, "step": 2470, "task_loss": 0.4625704288482666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14603759348392487, "epoch": 0.9, "learning_rate": 1.8918896889610276e-06, "loss": 0.1915, "step": 2480, "task_loss": 0.7576199769973755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16346848011016846, "epoch": 0.9, "learning_rate": 1.8910322953080787e-06, "loss": 0.1776, "step": 2490, "task_loss": 0.4902191162109375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14848199486732483, "epoch": 0.9, "learning_rate": 1.890171711056003e-06, "loss": 0.1751, "step": 2500, "task_loss": 0.4773310720920563 }, { "epoch": 0.9, "eval_exact_match": 83.52885525070955, "eval_f1": 90.03965181607728, "step": 2500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1388649195432663, "epoch": 0.91, "learning_rate": 1.8893079392863714e-06, "loss": 0.1646, "step": 2510, "task_loss": 0.3831629157066345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13925893604755402, "epoch": 0.91, "learning_rate": 1.8884409830921692e-06, "loss": 0.1754, "step": 2520, "task_loss": 0.4401072859764099 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20444272458553314, "epoch": 0.91, "learning_rate": 1.887570845577784e-06, "loss": 0.1794, "step": 2530, "task_loss": 0.545073390007019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15069428086280823, "epoch": 0.92, "learning_rate": 1.8866975298589949e-06, "loss": 0.183, "step": 2540, "task_loss": 0.3420984745025635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1433373987674713, "epoch": 0.92, "learning_rate": 1.885821039062962e-06, "loss": 0.1709, "step": 2550, "task_loss": 0.3048064112663269 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14648103713989258, "epoch": 0.93, "learning_rate": 1.8849413763282144e-06, "loss": 0.1836, "step": 2560, "task_loss": 0.4393615424633026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1583467423915863, "epoch": 0.93, "learning_rate": 1.8840585448046386e-06, "loss": 0.1746, "step": 2570, "task_loss": 0.5558731555938721 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1908012479543686, "epoch": 0.93, "learning_rate": 1.8831725476534693e-06, "loss": 0.1818, "step": 2580, "task_loss": 0.3851405382156372 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15493538975715637, "epoch": 0.94, "learning_rate": 1.882283388047275e-06, "loss": 0.1796, "step": 2590, "task_loss": 0.4332561492919922 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18247196078300476, "epoch": 0.94, "learning_rate": 1.88139106916995e-06, "loss": 0.1842, "step": 2600, "task_loss": 0.3662012219429016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16121315956115723, "epoch": 0.94, "learning_rate": 1.8804955942167e-06, "loss": 0.1717, "step": 2610, "task_loss": 0.7887633442878723 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13026580214500427, "epoch": 0.95, "learning_rate": 1.879596966394032e-06, "loss": 0.1763, "step": 2620, "task_loss": 0.3799874186515808 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1766907274723053, "epoch": 0.95, "learning_rate": 1.8786951889197438e-06, "loss": 0.178, "step": 2630, "task_loss": 0.4739248752593994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1579083502292633, "epoch": 0.95, "learning_rate": 1.8777902650229103e-06, "loss": 0.1818, "step": 2640, "task_loss": 0.5713690519332886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19626328349113464, "epoch": 0.96, "learning_rate": 1.8768821979438739e-06, "loss": 0.1771, "step": 2650, "task_loss": 0.3688851594924927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15492022037506104, "epoch": 0.96, "learning_rate": 1.875970990934231e-06, "loss": 0.1747, "step": 2660, "task_loss": 0.36857369542121887 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21618108451366425, "epoch": 0.96, "learning_rate": 1.875056647256823e-06, "loss": 0.1856, "step": 2670, "task_loss": 0.5106911659240723 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13856954872608185, "epoch": 0.97, "learning_rate": 1.8741391701857215e-06, "loss": 0.1816, "step": 2680, "task_loss": 0.3294844627380371 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14490175247192383, "epoch": 0.97, "learning_rate": 1.873218563006219e-06, "loss": 0.1729, "step": 2690, "task_loss": 0.3456054627895355 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.142696350812912, "epoch": 0.98, "learning_rate": 1.8722948290148161e-06, "loss": 0.1744, "step": 2700, "task_loss": 0.3695463538169861 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2125137597322464, "epoch": 0.98, "learning_rate": 1.8713679715192102e-06, "loss": 0.1904, "step": 2710, "task_loss": 0.5505295991897583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1854153871536255, "epoch": 0.98, "learning_rate": 1.8704379938382822e-06, "loss": 0.1877, "step": 2720, "task_loss": 0.4623679220676422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1481424868106842, "epoch": 0.99, "learning_rate": 1.869504899302087e-06, "loss": 0.1965, "step": 2730, "task_loss": 0.26727068424224854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16843540966510773, "epoch": 0.99, "learning_rate": 1.8685686912518394e-06, "loss": 0.1715, "step": 2740, "task_loss": 0.4420323967933655 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15341581404209137, "epoch": 0.99, "learning_rate": 1.8676293730399038e-06, "loss": 0.1783, "step": 2750, "task_loss": 0.701764702796936 }, { "epoch": 0.99, "eval_exact_match": 83.72753074739829, "eval_f1": 90.08435171358782, "step": 2750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19545786082744598, "epoch": 1.0, "learning_rate": 1.8666869480297808e-06, "loss": 0.1802, "step": 2760, "task_loss": 0.30919766426086426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15790414810180664, "epoch": 1.0, "learning_rate": 1.8657414195960958e-06, "loss": 0.164, "step": 2770, "task_loss": 0.6185303926467896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18240132927894592, "epoch": 1.0, "learning_rate": 1.8647927911245875e-06, "loss": 0.1829, "step": 2780, "task_loss": 0.4996330142021179 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17760756611824036, "epoch": 1.01, "learning_rate": 1.8638410660120947e-06, "loss": 0.1736, "step": 2790, "task_loss": 0.34335148334503174 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14696209132671356, "epoch": 1.01, "learning_rate": 1.8628862476665448e-06, "loss": 0.1828, "step": 2800, "task_loss": 0.42722922563552856 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15718209743499756, "epoch": 1.02, "learning_rate": 1.8619283395069409e-06, "loss": 0.1764, "step": 2810, "task_loss": 0.35283511877059937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15315774083137512, "epoch": 1.02, "learning_rate": 1.8609673449633513e-06, "loss": 0.1858, "step": 2820, "task_loss": 0.38201355934143066 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1411234736442566, "epoch": 1.02, "learning_rate": 1.8600032674768947e-06, "loss": 0.1745, "step": 2830, "task_loss": 0.5564037561416626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14134261012077332, "epoch": 1.03, "learning_rate": 1.8590361104997298e-06, "loss": 0.1873, "step": 2840, "task_loss": 0.28306859731674194 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15581990778446198, "epoch": 1.03, "learning_rate": 1.858065877495042e-06, "loss": 0.1664, "step": 2850, "task_loss": 0.5072811245918274 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1485014408826828, "epoch": 1.03, "learning_rate": 1.857092571937032e-06, "loss": 0.1805, "step": 2860, "task_loss": 0.48709845542907715 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16704809665679932, "epoch": 1.04, "learning_rate": 1.8561161973109014e-06, "loss": 0.1867, "step": 2870, "task_loss": 0.6069858074188232 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.148666113615036, "epoch": 1.04, "learning_rate": 1.8551367571128429e-06, "loss": 0.1896, "step": 2880, "task_loss": 0.3557353913784027 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16260361671447754, "epoch": 1.04, "learning_rate": 1.8541542548500256e-06, "loss": 0.1756, "step": 2890, "task_loss": 0.416062593460083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1194634884595871, "epoch": 1.05, "learning_rate": 1.853168694040583e-06, "loss": 0.1718, "step": 2900, "task_loss": 0.17556090652942657 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1434130072593689, "epoch": 1.05, "learning_rate": 1.8521800782136014e-06, "loss": 0.1679, "step": 2910, "task_loss": 0.4673941135406494 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21220733225345612, "epoch": 1.06, "learning_rate": 1.851188410909106e-06, "loss": 0.1763, "step": 2920, "task_loss": 0.562555730342865 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1408117711544037, "epoch": 1.06, "learning_rate": 1.850193695678048e-06, "loss": 0.1688, "step": 2930, "task_loss": 0.3450365662574768 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15347136557102203, "epoch": 1.06, "learning_rate": 1.8491959360822938e-06, "loss": 0.1821, "step": 2940, "task_loss": 0.5536178350448608 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18112389743328094, "epoch": 1.07, "learning_rate": 1.848195135694611e-06, "loss": 0.182, "step": 2950, "task_loss": 0.5740102529525757 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15279173851013184, "epoch": 1.07, "learning_rate": 1.8471912980986544e-06, "loss": 0.1787, "step": 2960, "task_loss": 0.16615566611289978 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16220858693122864, "epoch": 1.07, "learning_rate": 1.846285250419875e-06, "loss": 0.1767, "step": 2970, "task_loss": 0.47266218066215515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18561126291751862, "epoch": 1.08, "learning_rate": 1.8452756520401107e-06, "loss": 0.1785, "step": 2980, "task_loss": 0.4946807622909546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14157789945602417, "epoch": 1.08, "learning_rate": 1.8442630269061292e-06, "loss": 0.1778, "step": 2990, "task_loss": 0.34104907512664795 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11815983057022095, "epoch": 1.08, "learning_rate": 1.8432473786439283e-06, "loss": 0.1527, "step": 3000, "task_loss": 0.1873714029788971 }, { "epoch": 1.08, "eval_exact_match": 83.44370860927152, "eval_f1": 89.90493527722583, "step": 3000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16796885430812836, "epoch": 1.09, "learning_rate": 1.8422287108903304e-06, "loss": 0.1745, "step": 3010, "task_loss": 0.2368674874305725 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16812004148960114, "epoch": 1.09, "learning_rate": 1.841207027292971e-06, "loss": 0.1866, "step": 3020, "task_loss": 0.6989672183990479 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16153055429458618, "epoch": 1.1, "learning_rate": 1.8401823315102833e-06, "loss": 0.178, "step": 3030, "task_loss": 0.43721848726272583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12805390357971191, "epoch": 1.1, "learning_rate": 1.8391546272114878e-06, "loss": 0.1755, "step": 3040, "task_loss": 0.2398868352174759 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1456378549337387, "epoch": 1.1, "learning_rate": 1.8381239180765768e-06, "loss": 0.1684, "step": 3050, "task_loss": 0.5672136545181274 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15030169486999512, "epoch": 1.11, "learning_rate": 1.837090207796303e-06, "loss": 0.1841, "step": 3060, "task_loss": 0.4073163568973541 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1696191281080246, "epoch": 1.11, "learning_rate": 1.8360535000721655e-06, "loss": 0.1798, "step": 3070, "task_loss": 0.5377500057220459 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18132445216178894, "epoch": 1.11, "learning_rate": 1.8350137986163965e-06, "loss": 0.1859, "step": 3080, "task_loss": 0.5428669452667236 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12620098888874054, "epoch": 1.12, "learning_rate": 1.8339711071519482e-06, "loss": 0.1726, "step": 3090, "task_loss": 0.36496835947036743 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12938401103019714, "epoch": 1.12, "learning_rate": 1.8329254294124787e-06, "loss": 0.1684, "step": 3100, "task_loss": 0.3683048486709595 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17861530184745789, "epoch": 1.12, "learning_rate": 1.8318767691423402e-06, "loss": 0.1882, "step": 3110, "task_loss": 0.43564486503601074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1419188380241394, "epoch": 1.13, "learning_rate": 1.830825130096565e-06, "loss": 0.1769, "step": 3120, "task_loss": 0.5064725875854492 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18351003527641296, "epoch": 1.13, "learning_rate": 1.8297705160408503e-06, "loss": 0.1623, "step": 3130, "task_loss": 0.4635063409805298 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1454399824142456, "epoch": 1.13, "learning_rate": 1.8287129307515478e-06, "loss": 0.1687, "step": 3140, "task_loss": 0.3754516839981079 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1441243588924408, "epoch": 1.14, "learning_rate": 1.8276523780156474e-06, "loss": 0.1763, "step": 3150, "task_loss": 0.3571741580963135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15150251984596252, "epoch": 1.14, "learning_rate": 1.8265888616307657e-06, "loss": 0.1837, "step": 3160, "task_loss": 0.34464773535728455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13545379042625427, "epoch": 1.15, "learning_rate": 1.8255223854051305e-06, "loss": 0.1715, "step": 3170, "task_loss": 0.3557414412498474 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1452762484550476, "epoch": 1.15, "learning_rate": 1.824452953157569e-06, "loss": 0.1874, "step": 3180, "task_loss": 0.3559718728065491 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14096008241176605, "epoch": 1.15, "learning_rate": 1.823380568717493e-06, "loss": 0.1848, "step": 3190, "task_loss": 0.4971044063568115 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2189292013645172, "epoch": 1.16, "learning_rate": 1.8223052359248854e-06, "loss": 0.1742, "step": 3200, "task_loss": 0.3672422766685486 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1771106719970703, "epoch": 1.16, "learning_rate": 1.821226958630287e-06, "loss": 0.1686, "step": 3210, "task_loss": 0.43500882387161255 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12642604112625122, "epoch": 1.16, "learning_rate": 1.8201457406947814e-06, "loss": 0.166, "step": 3220, "task_loss": 0.1765107810497284 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16896989941596985, "epoch": 1.17, "learning_rate": 1.8190615859899824e-06, "loss": 0.1816, "step": 3230, "task_loss": 0.4253600835800171 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18833866715431213, "epoch": 1.17, "learning_rate": 1.8179744983980206e-06, "loss": 0.1822, "step": 3240, "task_loss": 0.48154908418655396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21106314659118652, "epoch": 1.17, "learning_rate": 1.816884481811527e-06, "loss": 0.1856, "step": 3250, "task_loss": 0.4936971664428711 }, { "epoch": 1.17, "eval_exact_match": 83.6329233680227, "eval_f1": 89.95397066155324, "step": 3250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13803541660308838, "epoch": 1.18, "learning_rate": 1.8157915401336218e-06, "loss": 0.1726, "step": 3260, "task_loss": 0.18990027904510498 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19129733741283417, "epoch": 1.18, "learning_rate": 1.8148053949047202e-06, "loss": 0.1913, "step": 3270, "task_loss": 0.3831537961959839 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15445619821548462, "epoch": 1.19, "learning_rate": 1.8137069063437304e-06, "loss": 0.1686, "step": 3280, "task_loss": 0.43404531478881836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18276625871658325, "epoch": 1.19, "learning_rate": 1.8126055040695588e-06, "loss": 0.1867, "step": 3290, "task_loss": 0.5082448720932007 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16971507668495178, "epoch": 1.19, "learning_rate": 1.8115011920260946e-06, "loss": 0.1896, "step": 3300, "task_loss": 0.4748695194721222 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15451842546463013, "epoch": 1.2, "learning_rate": 1.8103939741676465e-06, "loss": 0.1619, "step": 3310, "task_loss": 0.5286293029785156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16968689858913422, "epoch": 1.2, "learning_rate": 1.8092838544589287e-06, "loss": 0.1662, "step": 3320, "task_loss": 0.19997572898864746 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1290392279624939, "epoch": 1.2, "learning_rate": 1.8081708368750466e-06, "loss": 0.1628, "step": 3330, "task_loss": 0.3563808798789978 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12694013118743896, "epoch": 1.21, "learning_rate": 1.8070549254014816e-06, "loss": 0.1722, "step": 3340, "task_loss": 0.3265552818775177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1317066103219986, "epoch": 1.21, "learning_rate": 1.8059361240340782e-06, "loss": 0.1568, "step": 3350, "task_loss": 0.26479342579841614 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14310331642627716, "epoch": 1.21, "learning_rate": 1.8048144367790284e-06, "loss": 0.1919, "step": 3360, "task_loss": 0.3072636127471924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15124750137329102, "epoch": 1.22, "learning_rate": 1.803689867652858e-06, "loss": 0.1826, "step": 3370, "task_loss": 0.5191828608512878 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15719471871852875, "epoch": 1.22, "learning_rate": 1.802562420682413e-06, "loss": 0.1727, "step": 3380, "task_loss": 0.3152187466621399 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1399240493774414, "epoch": 1.23, "learning_rate": 1.8014320999048426e-06, "loss": 0.1718, "step": 3390, "task_loss": 0.2820362448692322 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16205359995365143, "epoch": 1.23, "learning_rate": 1.800298909367589e-06, "loss": 0.1843, "step": 3400, "task_loss": 0.36751991510391235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1521889865398407, "epoch": 1.23, "learning_rate": 1.799162853128368e-06, "loss": 0.1557, "step": 3410, "task_loss": 0.3483598828315735 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1441453993320465, "epoch": 1.24, "learning_rate": 1.7980239352551582e-06, "loss": 0.1681, "step": 3420, "task_loss": 0.5732893943786621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1875881552696228, "epoch": 1.24, "learning_rate": 1.7968821598261852e-06, "loss": 0.1703, "step": 3430, "task_loss": 0.37429314851760864 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.23559153079986572, "epoch": 1.24, "learning_rate": 1.7957375309299058e-06, "loss": 0.177, "step": 3440, "task_loss": 0.503436803817749 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15299516916275024, "epoch": 1.25, "learning_rate": 1.7945900526649957e-06, "loss": 0.1775, "step": 3450, "task_loss": 0.34383100271224976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11944176256656647, "epoch": 1.25, "learning_rate": 1.793439729140333e-06, "loss": 0.1805, "step": 3460, "task_loss": 0.1734773814678192 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1728697270154953, "epoch": 1.25, "learning_rate": 1.7922865644749843e-06, "loss": 0.167, "step": 3470, "task_loss": 0.39709553122520447 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1935918629169464, "epoch": 1.26, "learning_rate": 1.7911305627981892e-06, "loss": 0.1799, "step": 3480, "task_loss": 0.4305168092250824 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14809994399547577, "epoch": 1.26, "learning_rate": 1.7899717282493463e-06, "loss": 0.1799, "step": 3490, "task_loss": 0.5099273920059204 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15830612182617188, "epoch": 1.26, "learning_rate": 1.7888100649779986e-06, "loss": 0.1738, "step": 3500, "task_loss": 0.38358789682388306 }, { "epoch": 1.26, "eval_exact_match": 83.50047303689688, "eval_f1": 89.96018049046944, "step": 3500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15528270602226257, "epoch": 1.27, "learning_rate": 1.7876455771438178e-06, "loss": 0.1765, "step": 3510, "task_loss": 0.24267150461673737 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14513492584228516, "epoch": 1.27, "learning_rate": 1.7864782689165901e-06, "loss": 0.1703, "step": 3520, "task_loss": 0.551958441734314 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13601109385490417, "epoch": 1.28, "learning_rate": 1.7853081444761998e-06, "loss": 0.1683, "step": 3530, "task_loss": 0.2686072289943695 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13815787434577942, "epoch": 1.28, "learning_rate": 1.7841352080126164e-06, "loss": 0.1823, "step": 3540, "task_loss": 0.5631309747695923 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18684223294258118, "epoch": 1.28, "learning_rate": 1.7829594637258792e-06, "loss": 0.1765, "step": 3550, "task_loss": 0.40622183680534363 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16199585795402527, "epoch": 1.29, "learning_rate": 1.7817809158260805e-06, "loss": 0.1833, "step": 3560, "task_loss": 0.537696897983551 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13808155059814453, "epoch": 1.29, "learning_rate": 1.7805995685333524e-06, "loss": 0.1705, "step": 3570, "task_loss": 0.26295095682144165 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21583780646324158, "epoch": 1.29, "learning_rate": 1.7794154260778507e-06, "loss": 0.1735, "step": 3580, "task_loss": 0.4579695463180542 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22117865085601807, "epoch": 1.3, "learning_rate": 1.778228492699741e-06, "loss": 0.1788, "step": 3590, "task_loss": 0.6541140079498291 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17737066745758057, "epoch": 1.3, "learning_rate": 1.7770387726491812e-06, "loss": 0.1802, "step": 3600, "task_loss": 0.3846840262413025 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18559321761131287, "epoch": 1.3, "learning_rate": 1.7758462701863084e-06, "loss": 0.1833, "step": 3610, "task_loss": 0.43720221519470215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15115341544151306, "epoch": 1.31, "learning_rate": 1.7746509895812238e-06, "loss": 0.1638, "step": 3620, "task_loss": 0.44333165884017944 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15191200375556946, "epoch": 1.31, "learning_rate": 1.773452935113975e-06, "loss": 0.1797, "step": 3630, "task_loss": 0.5943065881729126 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1316663920879364, "epoch": 1.32, "learning_rate": 1.7722521110745427e-06, "loss": 0.1579, "step": 3640, "task_loss": 0.30335086584091187 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14825601875782013, "epoch": 1.32, "learning_rate": 1.7710485217628262e-06, "loss": 0.1763, "step": 3650, "task_loss": 0.2760236859321594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1533227413892746, "epoch": 1.32, "learning_rate": 1.7698421714886243e-06, "loss": 0.1824, "step": 3660, "task_loss": 0.2671685218811035 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16644388437271118, "epoch": 1.33, "learning_rate": 1.768633064571624e-06, "loss": 0.1936, "step": 3670, "task_loss": 0.49052125215530396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18509739637374878, "epoch": 1.33, "learning_rate": 1.7674212053413822e-06, "loss": 0.1649, "step": 3680, "task_loss": 0.2983229160308838 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14358261227607727, "epoch": 1.33, "learning_rate": 1.7662065981373124e-06, "loss": 0.1789, "step": 3690, "task_loss": 0.28044524788856506 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19266170263290405, "epoch": 1.34, "learning_rate": 1.7649892473086674e-06, "loss": 0.1825, "step": 3700, "task_loss": 0.2881600260734558 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15574830770492554, "epoch": 1.34, "learning_rate": 1.763769157214524e-06, "loss": 0.1734, "step": 3710, "task_loss": 0.33556947112083435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12162809073925018, "epoch": 1.34, "learning_rate": 1.7625463322237679e-06, "loss": 0.1706, "step": 3720, "task_loss": 0.37808844447135925 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18673118948936462, "epoch": 1.35, "learning_rate": 1.7613207767150783e-06, "loss": 0.1778, "step": 3730, "task_loss": 0.536344587802887 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12695255875587463, "epoch": 1.35, "learning_rate": 1.7600924950769117e-06, "loss": 0.1722, "step": 3740, "task_loss": 0.6155173778533936 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14430446922779083, "epoch": 1.36, "learning_rate": 1.758861491707486e-06, "loss": 0.1741, "step": 3750, "task_loss": 0.35061925649642944 }, { "epoch": 1.36, "eval_exact_match": 83.4720908230842, "eval_f1": 89.95138805784273, "step": 3750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1242603063583374, "epoch": 1.36, "learning_rate": 1.757627771014765e-06, "loss": 0.1656, "step": 3760, "task_loss": 0.42466431856155396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1508714258670807, "epoch": 1.36, "learning_rate": 1.756391337416443e-06, "loss": 0.1697, "step": 3770, "task_loss": 0.4327700436115265 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1856732964515686, "epoch": 1.37, "learning_rate": 1.7551521953399286e-06, "loss": 0.1926, "step": 3780, "task_loss": 0.5305187702178955 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1641940474510193, "epoch": 1.37, "learning_rate": 1.7539103492223286e-06, "loss": 0.164, "step": 3790, "task_loss": 0.3325423002243042 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15839411318302155, "epoch": 1.37, "learning_rate": 1.752665803510433e-06, "loss": 0.1747, "step": 3800, "task_loss": 0.4586430788040161 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12557630240917206, "epoch": 1.38, "learning_rate": 1.7514185626606972e-06, "loss": 0.1589, "step": 3810, "task_loss": 0.3240758180618286 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1498473584651947, "epoch": 1.38, "learning_rate": 1.7501686311392292e-06, "loss": 0.1709, "step": 3820, "task_loss": 0.33482322096824646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14660826325416565, "epoch": 1.38, "learning_rate": 1.7489160134217702e-06, "loss": 0.1773, "step": 3830, "task_loss": 0.3380383253097534 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17524400353431702, "epoch": 1.39, "learning_rate": 1.7476607139936807e-06, "loss": 0.1801, "step": 3840, "task_loss": 0.5500714182853699 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18646281957626343, "epoch": 1.39, "learning_rate": 1.746402737349924e-06, "loss": 0.1829, "step": 3850, "task_loss": 0.4430808126926422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18419930338859558, "epoch": 1.4, "learning_rate": 1.7451420879950491e-06, "loss": 0.1766, "step": 3860, "task_loss": 0.699094295501709 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1590745896100998, "epoch": 1.4, "learning_rate": 1.7438787704431765e-06, "loss": 0.1794, "step": 3870, "task_loss": 0.5042922496795654 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16868017613887787, "epoch": 1.4, "learning_rate": 1.7426127892179805e-06, "loss": 0.1618, "step": 3880, "task_loss": 0.6242218613624573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12290704250335693, "epoch": 1.41, "learning_rate": 1.7413441488526734e-06, "loss": 0.1671, "step": 3890, "task_loss": 0.33300209045410156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1805894672870636, "epoch": 1.41, "learning_rate": 1.7400728538899893e-06, "loss": 0.1794, "step": 3900, "task_loss": 0.4865798056125641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13778156042099, "epoch": 1.41, "learning_rate": 1.7387989088821677e-06, "loss": 0.1745, "step": 3910, "task_loss": 0.36040520668029785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18057343363761902, "epoch": 1.42, "learning_rate": 1.7375223183909378e-06, "loss": 0.1818, "step": 3920, "task_loss": 0.43555018305778503 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16372817754745483, "epoch": 1.42, "learning_rate": 1.7362430869875017e-06, "loss": 0.1622, "step": 3930, "task_loss": 0.3211020827293396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18226927518844604, "epoch": 1.42, "learning_rate": 1.7349612192525176e-06, "loss": 0.1748, "step": 3940, "task_loss": 0.39155834913253784 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17144691944122314, "epoch": 1.43, "learning_rate": 1.7336767197760837e-06, "loss": 0.1634, "step": 3950, "task_loss": 0.3620833158493042 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17456424236297607, "epoch": 1.43, "learning_rate": 1.7323895931577228e-06, "loss": 0.1813, "step": 3960, "task_loss": 0.5210827589035034 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1263851523399353, "epoch": 1.43, "learning_rate": 1.7310998440063647e-06, "loss": 0.1715, "step": 3970, "task_loss": 0.5859290361404419 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.129383385181427, "epoch": 1.44, "learning_rate": 1.7298074769403285e-06, "loss": 0.1638, "step": 3980, "task_loss": 0.31362149119377136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.23324280977249146, "epoch": 1.44, "learning_rate": 1.72851249658731e-06, "loss": 0.1804, "step": 3990, "task_loss": 0.451979398727417 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15351605415344238, "epoch": 1.45, "learning_rate": 1.727214907584361e-06, "loss": 0.1647, "step": 4000, "task_loss": 0.33222532272338867 }, { "epoch": 1.45, "eval_exact_match": 83.66130558183538, "eval_f1": 90.1062476420109, "step": 4000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16170555353164673, "epoch": 1.45, "learning_rate": 1.725914714577874e-06, "loss": 0.1619, "step": 4010, "task_loss": 0.5656744837760925 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14653730392456055, "epoch": 1.45, "learning_rate": 1.724611922223567e-06, "loss": 0.172, "step": 4020, "task_loss": 0.5113309621810913 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13656997680664062, "epoch": 1.46, "learning_rate": 1.7233065351864652e-06, "loss": 0.171, "step": 4030, "task_loss": 0.24282409250736237 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13946852087974548, "epoch": 1.46, "learning_rate": 1.7219985581408847e-06, "loss": 0.1782, "step": 4040, "task_loss": 0.6446897983551025 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.27610087394714355, "epoch": 1.46, "learning_rate": 1.7206879957704161e-06, "loss": 0.1864, "step": 4050, "task_loss": 0.7376024127006531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20331989228725433, "epoch": 1.47, "learning_rate": 1.7193748527679074e-06, "loss": 0.1816, "step": 4060, "task_loss": 0.6629816293716431 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15988487005233765, "epoch": 1.47, "learning_rate": 1.7180591338354479e-06, "loss": 0.1694, "step": 4070, "task_loss": 0.46234872937202454 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1611352115869522, "epoch": 1.47, "learning_rate": 1.7167408436843493e-06, "loss": 0.165, "step": 4080, "task_loss": 0.4623450040817261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15372850000858307, "epoch": 1.48, "learning_rate": 1.7154199870351319e-06, "loss": 0.1701, "step": 4090, "task_loss": 0.3854554295539856 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12263274192810059, "epoch": 1.48, "learning_rate": 1.7140965686175047e-06, "loss": 0.1648, "step": 4100, "task_loss": 0.3795510530471802 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14757771790027618, "epoch": 1.49, "learning_rate": 1.7127705931703511e-06, "loss": 0.1677, "step": 4110, "task_loss": 0.5079241991043091 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19403065741062164, "epoch": 1.49, "learning_rate": 1.7114420654417102e-06, "loss": 0.1749, "step": 4120, "task_loss": 0.6132655143737793 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17154933512210846, "epoch": 1.49, "learning_rate": 1.7101109901887594e-06, "loss": 0.1698, "step": 4130, "task_loss": 0.5632673501968384 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15782007575035095, "epoch": 1.5, "learning_rate": 1.7087773721777998e-06, "loss": 0.1682, "step": 4140, "task_loss": 0.29379528760910034 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15191909670829773, "epoch": 1.5, "learning_rate": 1.7074412161842368e-06, "loss": 0.1821, "step": 4150, "task_loss": 0.40577489137649536 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1590431034564972, "epoch": 1.5, "learning_rate": 1.7061025269925633e-06, "loss": 0.1725, "step": 4160, "task_loss": 0.3673279285430908 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15804147720336914, "epoch": 1.51, "learning_rate": 1.704761309396344e-06, "loss": 0.1744, "step": 4170, "task_loss": 0.3593347668647766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22128893435001373, "epoch": 1.51, "learning_rate": 1.7034175681981969e-06, "loss": 0.1745, "step": 4180, "task_loss": 0.3887554407119751 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17814043164253235, "epoch": 1.51, "learning_rate": 1.702071308209776e-06, "loss": 0.1842, "step": 4190, "task_loss": 0.37278926372528076 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1720452755689621, "epoch": 1.52, "learning_rate": 1.7007225342517554e-06, "loss": 0.1618, "step": 4200, "task_loss": 0.3401908576488495 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18010592460632324, "epoch": 1.52, "learning_rate": 1.6993712511538108e-06, "loss": 0.1714, "step": 4210, "task_loss": 0.35429495573043823 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14852438867092133, "epoch": 1.53, "learning_rate": 1.6980174637546022e-06, "loss": 0.176, "step": 4220, "task_loss": 0.2848626971244812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12181121110916138, "epoch": 1.53, "learning_rate": 1.6966611769017574e-06, "loss": 0.1731, "step": 4230, "task_loss": 0.234962597489357 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17690081894397736, "epoch": 1.53, "learning_rate": 1.6953023954518546e-06, "loss": 0.1816, "step": 4240, "task_loss": 0.3170431852340698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1582062691450119, "epoch": 1.54, "learning_rate": 1.6939411242704037e-06, "loss": 0.1727, "step": 4250, "task_loss": 0.6310293674468994 }, { "epoch": 1.54, "eval_exact_match": 83.66130558183538, "eval_f1": 90.01785833074021, "step": 4250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13817504048347473, "epoch": 1.54, "learning_rate": 1.6925773682318312e-06, "loss": 0.168, "step": 4260, "task_loss": 0.29341834783554077 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18749260902404785, "epoch": 1.54, "learning_rate": 1.6912111322194594e-06, "loss": 0.1685, "step": 4270, "task_loss": 0.5627535581588745 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14412984251976013, "epoch": 1.55, "learning_rate": 1.6898424211254927e-06, "loss": 0.1639, "step": 4280, "task_loss": 0.2700430452823639 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15190498530864716, "epoch": 1.55, "learning_rate": 1.6884712398509966e-06, "loss": 0.1656, "step": 4290, "task_loss": 0.2561272382736206 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13519339263439178, "epoch": 1.55, "learning_rate": 1.6870975933058835e-06, "loss": 0.1663, "step": 4300, "task_loss": 0.3266107439994812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15758942067623138, "epoch": 1.56, "learning_rate": 1.6857214864088927e-06, "loss": 0.1749, "step": 4310, "task_loss": 0.4796781837940216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18209287524223328, "epoch": 1.56, "learning_rate": 1.6843429240875726e-06, "loss": 0.1747, "step": 4320, "task_loss": 0.3897198438644409 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16047507524490356, "epoch": 1.56, "learning_rate": 1.6829619112782654e-06, "loss": 0.1644, "step": 4330, "task_loss": 0.4120858609676361 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1207498162984848, "epoch": 1.57, "learning_rate": 1.6815784529260868e-06, "loss": 0.1752, "step": 4340, "task_loss": 0.3987932801246643 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14837431907653809, "epoch": 1.57, "learning_rate": 1.6801925539849102e-06, "loss": 0.1675, "step": 4350, "task_loss": 0.4159763753414154 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16814565658569336, "epoch": 1.58, "learning_rate": 1.6788042194173485e-06, "loss": 0.1751, "step": 4360, "task_loss": 0.38205575942993164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1617308259010315, "epoch": 1.58, "learning_rate": 1.6774134541947351e-06, "loss": 0.1879, "step": 4370, "task_loss": 0.35478347539901733 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16979068517684937, "epoch": 1.58, "learning_rate": 1.6760202632971074e-06, "loss": 0.1883, "step": 4380, "task_loss": 0.4965069890022278 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20151479542255402, "epoch": 1.59, "learning_rate": 1.6746246517131894e-06, "loss": 0.1801, "step": 4390, "task_loss": 0.4965303838253021 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1382327377796173, "epoch": 1.59, "learning_rate": 1.6732266244403722e-06, "loss": 0.1728, "step": 4400, "task_loss": 0.41574716567993164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17530761659145355, "epoch": 1.59, "learning_rate": 1.6718261864846968e-06, "loss": 0.1828, "step": 4410, "task_loss": 0.518844485282898 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15779836475849152, "epoch": 1.6, "learning_rate": 1.6704233428608376e-06, "loss": 0.1705, "step": 4420, "task_loss": 0.3203689754009247 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1792672872543335, "epoch": 1.6, "learning_rate": 1.6690180985920818e-06, "loss": 0.1742, "step": 4430, "task_loss": 0.34858888387680054 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15927091240882874, "epoch": 1.6, "learning_rate": 1.6676104587103137e-06, "loss": 0.1661, "step": 4440, "task_loss": 0.42672163248062134 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1978459358215332, "epoch": 1.61, "learning_rate": 1.666200428255995e-06, "loss": 0.1701, "step": 4450, "task_loss": 0.42373642325401306 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12445167452096939, "epoch": 1.61, "learning_rate": 1.6647880122781487e-06, "loss": 0.1834, "step": 4460, "task_loss": 0.3997802138328552 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20981666445732117, "epoch": 1.62, "learning_rate": 1.6633732158343386e-06, "loss": 0.18, "step": 4470, "task_loss": 0.5796436667442322 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19721180200576782, "epoch": 1.62, "learning_rate": 1.6619560439906533e-06, "loss": 0.174, "step": 4480, "task_loss": 0.5064884424209595 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11864525079727173, "epoch": 1.62, "learning_rate": 1.6605365018216867e-06, "loss": 0.1624, "step": 4490, "task_loss": 0.33762532472610474 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16903159022331238, "epoch": 1.63, "learning_rate": 1.659114594410521e-06, "loss": 0.1668, "step": 4500, "task_loss": 0.3473363518714905 }, { "epoch": 1.63, "eval_exact_match": 83.71807000946073, "eval_f1": 90.03319320561076, "step": 4500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11900688707828522, "epoch": 1.63, "learning_rate": 1.6576903268487068e-06, "loss": 0.1645, "step": 4510, "task_loss": 0.38987505435943604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1665954291820526, "epoch": 1.63, "learning_rate": 1.6562637042362466e-06, "loss": 0.1797, "step": 4520, "task_loss": 0.6573120951652527 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21217182278633118, "epoch": 1.64, "learning_rate": 1.6548347316815762e-06, "loss": 0.1782, "step": 4530, "task_loss": 0.5453046560287476 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13434047996997833, "epoch": 1.64, "learning_rate": 1.6534034143015454e-06, "loss": 0.1626, "step": 4540, "task_loss": 0.31057432293891907 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16948871314525604, "epoch": 1.64, "learning_rate": 1.6519697572214003e-06, "loss": 0.1811, "step": 4550, "task_loss": 0.40966400504112244 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14870113134384155, "epoch": 1.65, "learning_rate": 1.6505337655747651e-06, "loss": 0.1712, "step": 4560, "task_loss": 0.46824079751968384 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17549389600753784, "epoch": 1.65, "learning_rate": 1.649095444503624e-06, "loss": 0.172, "step": 4570, "task_loss": 0.47035887837409973 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16029421985149384, "epoch": 1.66, "learning_rate": 1.647654799158302e-06, "loss": 0.1705, "step": 4580, "task_loss": 0.556218147277832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17698289453983307, "epoch": 1.66, "learning_rate": 1.6462118346974465e-06, "loss": 0.1826, "step": 4590, "task_loss": 0.48089244961738586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15601950883865356, "epoch": 1.66, "learning_rate": 1.6447665562880102e-06, "loss": 0.1757, "step": 4600, "task_loss": 0.5369211435317993 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18061840534210205, "epoch": 1.67, "learning_rate": 1.6433189691052304e-06, "loss": 0.1852, "step": 4610, "task_loss": 0.5473412275314331 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16047485172748566, "epoch": 1.67, "learning_rate": 1.6418690783326124e-06, "loss": 0.1727, "step": 4620, "task_loss": 0.38301435112953186 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.213972270488739, "epoch": 1.67, "learning_rate": 1.6404168891619099e-06, "loss": 0.1884, "step": 4630, "task_loss": 0.4033554196357727 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19080013036727905, "epoch": 1.68, "learning_rate": 1.6389624067931063e-06, "loss": 0.1805, "step": 4640, "task_loss": 0.3459513783454895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19626328349113464, "epoch": 1.68, "learning_rate": 1.6375056364343976e-06, "loss": 0.181, "step": 4650, "task_loss": 0.40217965841293335 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13872280716896057, "epoch": 1.68, "learning_rate": 1.6360465833021714e-06, "loss": 0.1681, "step": 4660, "task_loss": 0.6281946301460266 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17640194296836853, "epoch": 1.69, "learning_rate": 1.6345852526209898e-06, "loss": 0.1802, "step": 4670, "task_loss": 0.37003079056739807 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1716892421245575, "epoch": 1.69, "learning_rate": 1.6331216496235704e-06, "loss": 0.1797, "step": 4680, "task_loss": 0.26079100370407104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16866642236709595, "epoch": 1.69, "learning_rate": 1.6316557795507681e-06, "loss": 0.1798, "step": 4690, "task_loss": 0.6609709858894348 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14990170300006866, "epoch": 1.7, "learning_rate": 1.6301876476515543e-06, "loss": 0.1595, "step": 4700, "task_loss": 0.40529322624206543 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15224753320217133, "epoch": 1.7, "learning_rate": 1.6287172591830013e-06, "loss": 0.1736, "step": 4710, "task_loss": 0.31739816069602966 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1693233698606491, "epoch": 1.71, "learning_rate": 1.62724461941026e-06, "loss": 0.1736, "step": 4720, "task_loss": 0.27699169516563416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14131656289100647, "epoch": 1.71, "learning_rate": 1.6257697336065437e-06, "loss": 0.1751, "step": 4730, "task_loss": 0.2714840769767761 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17337501049041748, "epoch": 1.71, "learning_rate": 1.6242926070531081e-06, "loss": 0.1853, "step": 4740, "task_loss": 0.4388749599456787 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1466250717639923, "epoch": 1.72, "learning_rate": 1.6228132450392327e-06, "loss": 0.1855, "step": 4750, "task_loss": 0.7262935042381287 }, { "epoch": 1.72, "eval_exact_match": 83.65184484389782, "eval_f1": 90.0956056491023, "step": 4750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19323478639125824, "epoch": 1.72, "learning_rate": 1.6213316528622013e-06, "loss": 0.192, "step": 4760, "task_loss": 0.5335832834243774 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20000815391540527, "epoch": 1.72, "learning_rate": 1.6198478358272834e-06, "loss": 0.1739, "step": 4770, "task_loss": 0.38792669773101807 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1575516015291214, "epoch": 1.73, "learning_rate": 1.6183617992477161e-06, "loss": 0.1754, "step": 4780, "task_loss": 0.6262178421020508 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12828651070594788, "epoch": 1.73, "learning_rate": 1.6168735484446833e-06, "loss": 0.1726, "step": 4790, "task_loss": 0.4547956585884094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14732638001441956, "epoch": 1.73, "learning_rate": 1.6153830887472983e-06, "loss": 0.175, "step": 4800, "task_loss": 0.30394428968429565 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13601407408714294, "epoch": 1.74, "learning_rate": 1.6138904254925831e-06, "loss": 0.1766, "step": 4810, "task_loss": 0.3303675651550293 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16339826583862305, "epoch": 1.74, "learning_rate": 1.612395564025451e-06, "loss": 0.1643, "step": 4820, "task_loss": 0.4129944443702698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1557186394929886, "epoch": 1.75, "learning_rate": 1.6108985096986862e-06, "loss": 0.1655, "step": 4830, "task_loss": 0.3643401563167572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16704073548316956, "epoch": 1.75, "learning_rate": 1.6093992678729252e-06, "loss": 0.1857, "step": 4840, "task_loss": 0.33650100231170654 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1716494858264923, "epoch": 1.75, "learning_rate": 1.6078978439166372e-06, "loss": 0.18, "step": 4850, "task_loss": 0.47747802734375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13160017132759094, "epoch": 1.76, "learning_rate": 1.6063942432061062e-06, "loss": 0.1681, "step": 4860, "task_loss": 0.3302234411239624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14614957571029663, "epoch": 1.76, "learning_rate": 1.6048884711254086e-06, "loss": 0.1657, "step": 4870, "task_loss": 0.4074876010417938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13879385590553284, "epoch": 1.76, "learning_rate": 1.6033805330663987e-06, "loss": 0.1656, "step": 4880, "task_loss": 0.5125229358673096 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1442992389202118, "epoch": 1.77, "learning_rate": 1.6018704344286844e-06, "loss": 0.1663, "step": 4890, "task_loss": 0.33929747343063354 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17423328757286072, "epoch": 1.77, "learning_rate": 1.6003581806196117e-06, "loss": 0.1806, "step": 4900, "task_loss": 0.6483819484710693 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16412119567394257, "epoch": 1.77, "learning_rate": 1.5988437770542426e-06, "loss": 0.178, "step": 4910, "task_loss": 0.37893688678741455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13276568055152893, "epoch": 1.78, "learning_rate": 1.5973272291553381e-06, "loss": 0.1658, "step": 4920, "task_loss": 0.5486671924591064 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1630297601222992, "epoch": 1.78, "learning_rate": 1.5958085423533367e-06, "loss": 0.1533, "step": 4930, "task_loss": 0.3111371695995331 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15084078907966614, "epoch": 1.79, "learning_rate": 1.5942877220863367e-06, "loss": 0.156, "step": 4940, "task_loss": 0.4554649889469147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1458296775817871, "epoch": 1.79, "learning_rate": 1.592764773800075e-06, "loss": 0.1713, "step": 4950, "task_loss": 0.24363714456558228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19936226308345795, "epoch": 1.79, "learning_rate": 1.5912397029479088e-06, "loss": 0.179, "step": 4960, "task_loss": 0.42103761434555054 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15008442103862762, "epoch": 1.8, "learning_rate": 1.5897125149907961e-06, "loss": 0.1738, "step": 4970, "task_loss": 0.6863405704498291 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1526724100112915, "epoch": 1.8, "learning_rate": 1.5881832153972757e-06, "loss": 0.1757, "step": 4980, "task_loss": 0.24165448546409607 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15206190943717957, "epoch": 1.8, "learning_rate": 1.586651809643447e-06, "loss": 0.1781, "step": 4990, "task_loss": 0.46479007601737976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13848325610160828, "epoch": 1.81, "learning_rate": 1.5851183032129524e-06, "loss": 0.1767, "step": 5000, "task_loss": 0.5114116668701172 }, { "epoch": 1.81, "eval_exact_match": 83.69914853358561, "eval_f1": 90.02717757860586, "step": 5000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15613943338394165, "epoch": 1.81, "learning_rate": 1.5835827015969554e-06, "loss": 0.1747, "step": 5010, "task_loss": 0.38822025060653687 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18799282610416412, "epoch": 1.81, "learning_rate": 1.5820450102941225e-06, "loss": 0.1926, "step": 5020, "task_loss": 0.5233188271522522 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15289965271949768, "epoch": 1.82, "learning_rate": 1.5805052348106021e-06, "loss": 0.1647, "step": 5030, "task_loss": 0.30817002058029175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19276724755764008, "epoch": 1.82, "learning_rate": 1.5789633806600064e-06, "loss": 0.1794, "step": 5040, "task_loss": 0.6410900950431824 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1649995744228363, "epoch": 1.83, "learning_rate": 1.5774194533633908e-06, "loss": 0.1672, "step": 5050, "task_loss": 0.4419615864753723 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1605244129896164, "epoch": 1.83, "learning_rate": 1.5758734584492338e-06, "loss": 0.1971, "step": 5060, "task_loss": 0.5084717273712158 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18353916704654694, "epoch": 1.83, "learning_rate": 1.574325401453418e-06, "loss": 0.1884, "step": 5070, "task_loss": 0.46484851837158203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1702931821346283, "epoch": 1.84, "learning_rate": 1.5727752879192093e-06, "loss": 0.1876, "step": 5080, "task_loss": 0.4551842212677002 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17121371626853943, "epoch": 1.84, "learning_rate": 1.5712231233972386e-06, "loss": 0.1634, "step": 5090, "task_loss": 0.23131704330444336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12615904211997986, "epoch": 1.84, "learning_rate": 1.5696689134454802e-06, "loss": 0.1659, "step": 5100, "task_loss": 0.4349666237831116 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1561717838048935, "epoch": 1.85, "learning_rate": 1.5681126636292326e-06, "loss": 0.182, "step": 5110, "task_loss": 0.5192071199417114 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16207730770111084, "epoch": 1.85, "learning_rate": 1.5665543795210989e-06, "loss": 0.1699, "step": 5120, "task_loss": 1.110886573791504 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12102855741977692, "epoch": 1.85, "learning_rate": 1.564994066700967e-06, "loss": 0.1718, "step": 5130, "task_loss": 0.39551660418510437 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16230420768260956, "epoch": 1.86, "learning_rate": 1.5634317307559882e-06, "loss": 0.1865, "step": 5140, "task_loss": 0.47860944271087646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.183951735496521, "epoch": 1.86, "learning_rate": 1.561867377280559e-06, "loss": 0.174, "step": 5150, "task_loss": 0.3939540684223175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16964015364646912, "epoch": 1.86, "learning_rate": 1.5603010118762997e-06, "loss": 0.1748, "step": 5160, "task_loss": 0.4153340458869934 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1425359845161438, "epoch": 1.87, "learning_rate": 1.5587326401520357e-06, "loss": 0.1766, "step": 5170, "task_loss": 0.4296700954437256 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2204303741455078, "epoch": 1.87, "learning_rate": 1.5571622677237754e-06, "loss": 0.1859, "step": 5180, "task_loss": 0.3614335060119629 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18071290850639343, "epoch": 1.88, "learning_rate": 1.5555899002146928e-06, "loss": 0.1698, "step": 5190, "task_loss": 0.4687485098838806 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.174323171377182, "epoch": 1.88, "learning_rate": 1.5540155432551041e-06, "loss": 0.172, "step": 5200, "task_loss": 0.45254212617874146 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14772723615169525, "epoch": 1.88, "learning_rate": 1.5524392024824508e-06, "loss": 0.1628, "step": 5210, "task_loss": 0.2734772562980652 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1434626281261444, "epoch": 1.89, "learning_rate": 1.5508608835412773e-06, "loss": 0.1659, "step": 5220, "task_loss": 0.3382406532764435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1506517231464386, "epoch": 1.89, "learning_rate": 1.5492805920832117e-06, "loss": 0.1697, "step": 5230, "task_loss": 0.5476049184799194 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19249403476715088, "epoch": 1.89, "learning_rate": 1.5476983337669451e-06, "loss": 0.1781, "step": 5240, "task_loss": 0.4856623113155365 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16049784421920776, "epoch": 1.9, "learning_rate": 1.5461141142582115e-06, "loss": 0.1653, "step": 5250, "task_loss": 0.2884122133255005 }, { "epoch": 1.9, "eval_exact_match": 83.64238410596026, "eval_f1": 90.02509905020891, "step": 5250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.145027756690979, "epoch": 1.9, "learning_rate": 1.5445279392297672e-06, "loss": 0.1717, "step": 5260, "task_loss": 0.27939456701278687 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20636796951293945, "epoch": 1.9, "learning_rate": 1.5429398143613717e-06, "loss": 0.1707, "step": 5270, "task_loss": 0.3808209002017975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19634541869163513, "epoch": 1.91, "learning_rate": 1.5413497453397658e-06, "loss": 0.1781, "step": 5280, "task_loss": 0.8501614332199097 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13850626349449158, "epoch": 1.91, "learning_rate": 1.5397577378586514e-06, "loss": 0.1751, "step": 5290, "task_loss": 0.5088834762573242 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1705392599105835, "epoch": 1.92, "learning_rate": 1.5381637976186733e-06, "loss": 0.1822, "step": 5300, "task_loss": 0.453264057636261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14243380725383759, "epoch": 1.92, "learning_rate": 1.5365679303273956e-06, "loss": 0.1858, "step": 5310, "task_loss": 0.8157292604446411 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14648321270942688, "epoch": 1.92, "learning_rate": 1.5349701416992828e-06, "loss": 0.1604, "step": 5320, "task_loss": 0.44204074144363403 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13808469474315643, "epoch": 1.93, "learning_rate": 1.5333704374556802e-06, "loss": 0.1723, "step": 5330, "task_loss": 0.286069393157959 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15084746479988098, "epoch": 1.93, "learning_rate": 1.5317688233247918e-06, "loss": 0.1631, "step": 5340, "task_loss": 0.4517083764076233 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16623607277870178, "epoch": 1.93, "learning_rate": 1.5301653050416607e-06, "loss": 0.1859, "step": 5350, "task_loss": 0.34950506687164307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15999534726142883, "epoch": 1.94, "learning_rate": 1.5285598883481488e-06, "loss": 0.1751, "step": 5360, "task_loss": 0.28156834840774536 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15565112233161926, "epoch": 1.94, "learning_rate": 1.526952578992915e-06, "loss": 0.1744, "step": 5370, "task_loss": 0.561008095741272 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19666364789009094, "epoch": 1.94, "learning_rate": 1.5253433827313959e-06, "loss": 0.187, "step": 5380, "task_loss": 0.5248037576675415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12654519081115723, "epoch": 1.95, "learning_rate": 1.5237323053257849e-06, "loss": 0.1773, "step": 5390, "task_loss": 0.3182612359523773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1488625556230545, "epoch": 1.95, "learning_rate": 1.5221193525450105e-06, "loss": 0.185, "step": 5400, "task_loss": 0.40940284729003906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17843294143676758, "epoch": 1.96, "learning_rate": 1.5205045301647176e-06, "loss": 0.1733, "step": 5410, "task_loss": 0.4217627942562103 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13039106130599976, "epoch": 1.96, "learning_rate": 1.5188878439672456e-06, "loss": 0.161, "step": 5420, "task_loss": 0.6658985614776611 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15170055627822876, "epoch": 1.96, "learning_rate": 1.5172692997416074e-06, "loss": 0.1821, "step": 5430, "task_loss": 0.37789255380630493 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1514560878276825, "epoch": 1.97, "learning_rate": 1.5156489032834689e-06, "loss": 0.1965, "step": 5440, "task_loss": 0.40159061551094055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13064901530742645, "epoch": 1.97, "learning_rate": 1.5140266603951288e-06, "loss": 0.174, "step": 5450, "task_loss": 0.4346536099910736 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19731932878494263, "epoch": 1.97, "learning_rate": 1.5124025768854975e-06, "loss": 0.1911, "step": 5460, "task_loss": 0.5681113600730896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1736607700586319, "epoch": 1.98, "learning_rate": 1.5107766585700765e-06, "loss": 0.1773, "step": 5470, "task_loss": 0.40363502502441406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18201759457588196, "epoch": 1.98, "learning_rate": 1.509148911270937e-06, "loss": 0.1708, "step": 5480, "task_loss": 0.42472249269485474 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1587354838848114, "epoch": 1.98, "learning_rate": 1.5075193408166995e-06, "loss": 0.1696, "step": 5490, "task_loss": 0.5016087889671326 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14640197157859802, "epoch": 1.99, "learning_rate": 1.5058879530425129e-06, "loss": 0.1814, "step": 5500, "task_loss": 0.9462409019470215 }, { "epoch": 1.99, "eval_exact_match": 83.79375591296122, "eval_f1": 90.06572493808123, "step": 5500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17614376544952393, "epoch": 1.99, "learning_rate": 1.5042547537900334e-06, "loss": 0.1724, "step": 5510, "task_loss": 0.5415596961975098 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16879351437091827, "epoch": 1.99, "learning_rate": 1.5026197489074038e-06, "loss": 0.1792, "step": 5520, "task_loss": 0.7170099020004272 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13919247686862946, "epoch": 2.0, "learning_rate": 1.5009829442492321e-06, "loss": 0.1896, "step": 5530, "task_loss": 0.37363719940185547 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13153812289237976, "epoch": 2.0, "learning_rate": 1.4993443456765722e-06, "loss": 0.1611, "step": 5540, "task_loss": 0.27757397294044495 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20778411626815796, "epoch": 2.01, "learning_rate": 1.4977039590569e-06, "loss": 0.1793, "step": 5550, "task_loss": 0.8411521315574646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17973269522190094, "epoch": 2.01, "learning_rate": 1.4960617902640954e-06, "loss": 0.1709, "step": 5560, "task_loss": 0.396151065826416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1556098610162735, "epoch": 2.01, "learning_rate": 1.4944178451784185e-06, "loss": 0.1685, "step": 5570, "task_loss": 0.6039637327194214 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1373765766620636, "epoch": 2.02, "learning_rate": 1.4927721296864911e-06, "loss": 0.1596, "step": 5580, "task_loss": 0.4312272071838379 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11067195981740952, "epoch": 2.02, "learning_rate": 1.4911246496812736e-06, "loss": 0.1822, "step": 5590, "task_loss": 0.2303856760263443 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14561320841312408, "epoch": 2.02, "learning_rate": 1.4894754110620462e-06, "loss": 0.1854, "step": 5600, "task_loss": 0.46308350563049316 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16818490624427795, "epoch": 2.03, "learning_rate": 1.4878244197343843e-06, "loss": 0.1812, "step": 5610, "task_loss": 0.76872318983078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17642438411712646, "epoch": 2.03, "learning_rate": 1.4861716816101408e-06, "loss": 0.1649, "step": 5620, "task_loss": 0.31745028495788574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1748821884393692, "epoch": 2.03, "learning_rate": 1.4845172026074229e-06, "loss": 0.181, "step": 5630, "task_loss": 0.5924056172370911 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1267927885055542, "epoch": 2.04, "learning_rate": 1.4828609886505719e-06, "loss": 0.1638, "step": 5640, "task_loss": 0.3259366750717163 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19988219439983368, "epoch": 2.04, "learning_rate": 1.4812030456701412e-06, "loss": 0.1733, "step": 5650, "task_loss": 0.5508837699890137 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15306976437568665, "epoch": 2.05, "learning_rate": 1.4795433796028758e-06, "loss": 0.1812, "step": 5660, "task_loss": 0.22895438969135284 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1771416962146759, "epoch": 2.05, "learning_rate": 1.4778819963916909e-06, "loss": 0.172, "step": 5670, "task_loss": 0.6265113949775696 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21116046607494354, "epoch": 2.05, "learning_rate": 1.4762189019856499e-06, "loss": 0.1802, "step": 5680, "task_loss": 0.5534864664077759 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18786293268203735, "epoch": 2.06, "learning_rate": 1.4745541023399435e-06, "loss": 0.1887, "step": 5690, "task_loss": 0.6557093858718872 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20581617951393127, "epoch": 2.06, "learning_rate": 1.4728876034158692e-06, "loss": 0.1742, "step": 5700, "task_loss": 0.4462522268295288 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13658413290977478, "epoch": 2.06, "learning_rate": 1.4712194111808093e-06, "loss": 0.1734, "step": 5710, "task_loss": 0.4286487102508545 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17904964089393616, "epoch": 2.07, "learning_rate": 1.4695495316082085e-06, "loss": 0.1724, "step": 5720, "task_loss": 0.5003844499588013 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1682061105966568, "epoch": 2.07, "learning_rate": 1.4678779706775547e-06, "loss": 0.1732, "step": 5730, "task_loss": 0.4084094762802124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14257608354091644, "epoch": 2.07, "learning_rate": 1.466204734374355e-06, "loss": 0.17, "step": 5740, "task_loss": 0.25102728605270386 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16918689012527466, "epoch": 2.08, "learning_rate": 1.4645298286901168e-06, "loss": 0.1728, "step": 5750, "task_loss": 0.3457295298576355 }, { "epoch": 2.08, "eval_exact_match": 83.74645222327341, "eval_f1": 89.96027080749701, "step": 5750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14827761054039001, "epoch": 2.08, "learning_rate": 1.4628532596223252e-06, "loss": 0.1802, "step": 5760, "task_loss": 0.2933032512664795 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14048659801483154, "epoch": 2.09, "learning_rate": 1.461175033174421e-06, "loss": 0.1628, "step": 5770, "task_loss": 0.4381973147392273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14933642745018005, "epoch": 2.09, "learning_rate": 1.45949515535578e-06, "loss": 0.1618, "step": 5780, "task_loss": 0.7806057929992676 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15557800233364105, "epoch": 2.09, "learning_rate": 1.4578136321816908e-06, "loss": 0.1717, "step": 5790, "task_loss": 0.554169774055481 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13000071048736572, "epoch": 2.1, "learning_rate": 1.4561304696733342e-06, "loss": 0.1656, "step": 5800, "task_loss": 0.22170893847942352 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18678736686706543, "epoch": 2.1, "learning_rate": 1.4544456738577608e-06, "loss": 0.1789, "step": 5810, "task_loss": 0.2693910300731659 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16203993558883667, "epoch": 2.1, "learning_rate": 1.4527592507678702e-06, "loss": 0.1692, "step": 5820, "task_loss": 0.4591533839702606 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17562934756278992, "epoch": 2.11, "learning_rate": 1.4510712064423883e-06, "loss": 0.1771, "step": 5830, "task_loss": 0.42079657316207886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19123172760009766, "epoch": 2.11, "learning_rate": 1.4493815469258466e-06, "loss": 0.1712, "step": 5840, "task_loss": 0.33686795830726624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13670551776885986, "epoch": 2.11, "learning_rate": 1.4476902782685603e-06, "loss": 0.1687, "step": 5850, "task_loss": 0.40342459082603455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12456159293651581, "epoch": 2.12, "learning_rate": 1.4459974065266062e-06, "loss": 0.1907, "step": 5860, "task_loss": 0.29705381393432617 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15771573781967163, "epoch": 2.12, "learning_rate": 1.444302937761802e-06, "loss": 0.1684, "step": 5870, "task_loss": 0.5629597306251526 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1410112977027893, "epoch": 2.13, "learning_rate": 1.442606878041684e-06, "loss": 0.1741, "step": 5880, "task_loss": 0.3297852873802185 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18847446143627167, "epoch": 2.13, "learning_rate": 1.4409092334394845e-06, "loss": 0.1759, "step": 5890, "task_loss": 0.48122259974479675 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15323123335838318, "epoch": 2.13, "learning_rate": 1.439210010034112e-06, "loss": 0.1687, "step": 5900, "task_loss": 0.3984212875366211 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1503305733203888, "epoch": 2.14, "learning_rate": 1.4375092139101279e-06, "loss": 0.1773, "step": 5910, "task_loss": 0.5217478275299072 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16366368532180786, "epoch": 2.14, "learning_rate": 1.4358068511577248e-06, "loss": 0.1824, "step": 5920, "task_loss": 0.7586253881454468 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13122406601905823, "epoch": 2.14, "learning_rate": 1.434102927872706e-06, "loss": 0.167, "step": 5930, "task_loss": 0.274662584066391 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17711997032165527, "epoch": 2.15, "learning_rate": 1.4323974501564617e-06, "loss": 0.1627, "step": 5940, "task_loss": 0.5237435102462769 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1662512719631195, "epoch": 2.15, "learning_rate": 1.4306904241159488e-06, "loss": 0.1883, "step": 5950, "task_loss": 0.2580040693283081 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15990257263183594, "epoch": 2.15, "learning_rate": 1.4289818558636686e-06, "loss": 0.1749, "step": 5960, "task_loss": 0.5619537830352783 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18278679251670837, "epoch": 2.16, "learning_rate": 1.4272717515176443e-06, "loss": 0.1619, "step": 5970, "task_loss": 0.32770949602127075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18862998485565186, "epoch": 2.16, "learning_rate": 1.425560117201399e-06, "loss": 0.1882, "step": 5980, "task_loss": 0.4872078597545624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13203898072242737, "epoch": 2.16, "learning_rate": 1.4238469590439358e-06, "loss": 0.1634, "step": 5990, "task_loss": 0.38844966888427734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14819516241550446, "epoch": 2.17, "learning_rate": 1.4221322831797133e-06, "loss": 0.1705, "step": 6000, "task_loss": 0.5028584003448486 }, { "epoch": 2.17, "eval_exact_match": 83.67076631977294, "eval_f1": 89.97577151956914, "step": 6000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18577471375465393, "epoch": 2.17, "learning_rate": 1.420416095748625e-06, "loss": 0.1704, "step": 6010, "task_loss": 0.30688726902008057 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1723865419626236, "epoch": 2.18, "learning_rate": 1.4186984028959766e-06, "loss": 0.1799, "step": 6020, "task_loss": 0.42208823561668396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14269012212753296, "epoch": 2.18, "learning_rate": 1.4169792107724647e-06, "loss": 0.161, "step": 6030, "task_loss": 0.3946291506290436 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17888927459716797, "epoch": 2.18, "learning_rate": 1.4152585255341547e-06, "loss": 0.1647, "step": 6040, "task_loss": 0.4846251904964447 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12159587442874908, "epoch": 2.19, "learning_rate": 1.4135363533424585e-06, "loss": 0.168, "step": 6050, "task_loss": 0.4344036281108856 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15208034217357635, "epoch": 2.19, "learning_rate": 1.4118127003641116e-06, "loss": 0.1769, "step": 6060, "task_loss": 0.43158042430877686 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1561131477355957, "epoch": 2.19, "learning_rate": 1.4100875727711533e-06, "loss": 0.1698, "step": 6070, "task_loss": 0.627993643283844 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16031518578529358, "epoch": 2.2, "learning_rate": 1.4083609767409019e-06, "loss": 0.1599, "step": 6080, "task_loss": 0.21831873059272766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17668455839157104, "epoch": 2.2, "learning_rate": 1.406632918455935e-06, "loss": 0.1931, "step": 6090, "task_loss": 0.29211580753326416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17368084192276, "epoch": 2.2, "learning_rate": 1.4049034041040647e-06, "loss": 0.1753, "step": 6100, "task_loss": 0.44469308853149414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12126050889492035, "epoch": 2.21, "learning_rate": 1.4031724398783192e-06, "loss": 0.1514, "step": 6110, "task_loss": 0.26887062191963196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16346772015094757, "epoch": 2.21, "learning_rate": 1.401440031976916e-06, "loss": 0.1721, "step": 6120, "task_loss": 0.5284141302108765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1427915096282959, "epoch": 2.22, "learning_rate": 1.3997061866032439e-06, "loss": 0.1686, "step": 6130, "task_loss": 0.5118228793144226 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1318717747926712, "epoch": 2.22, "learning_rate": 1.3979709099658376e-06, "loss": 0.1831, "step": 6140, "task_loss": 0.41089510917663574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1968536078929901, "epoch": 2.22, "learning_rate": 1.3962342082783582e-06, "loss": 0.1758, "step": 6150, "task_loss": 0.5051559209823608 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15244260430335999, "epoch": 2.23, "learning_rate": 1.3944960877595684e-06, "loss": 0.1816, "step": 6160, "task_loss": 0.3118957281112671 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13998454809188843, "epoch": 2.23, "learning_rate": 1.3927565546333123e-06, "loss": 0.1625, "step": 6170, "task_loss": 0.556923508644104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13642212748527527, "epoch": 2.23, "learning_rate": 1.391015615128492e-06, "loss": 0.1645, "step": 6180, "task_loss": 0.35751280188560486 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1508575975894928, "epoch": 2.24, "learning_rate": 1.3892732754790455e-06, "loss": 0.1742, "step": 6190, "task_loss": 0.41975730657577515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10277068614959717, "epoch": 2.24, "learning_rate": 1.3875295419239242e-06, "loss": 0.1581, "step": 6200, "task_loss": 0.3505735695362091 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21870091557502747, "epoch": 2.24, "learning_rate": 1.385784420707071e-06, "loss": 0.1818, "step": 6210, "task_loss": 0.4473569691181183 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1878502368927002, "epoch": 2.25, "learning_rate": 1.3840379180773975e-06, "loss": 0.1768, "step": 6220, "task_loss": 0.47854381799697876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15077124536037445, "epoch": 2.25, "learning_rate": 1.3822900402887626e-06, "loss": 0.1688, "step": 6230, "task_loss": 0.26176613569259644 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18990695476531982, "epoch": 2.26, "learning_rate": 1.3805407935999482e-06, "loss": 0.1832, "step": 6240, "task_loss": 0.546806275844574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1040068119764328, "epoch": 2.26, "learning_rate": 1.378790184274639e-06, "loss": 0.1759, "step": 6250, "task_loss": 0.09956555813550949 }, { "epoch": 2.26, "eval_exact_match": 83.85998107852413, "eval_f1": 90.12192979235915, "step": 6250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.159162700176239, "epoch": 2.26, "learning_rate": 1.3770382185813986e-06, "loss": 0.1785, "step": 6260, "task_loss": 0.2582295536994934 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16721557080745697, "epoch": 2.27, "learning_rate": 1.3752849027936473e-06, "loss": 0.167, "step": 6270, "task_loss": 0.3453947603702545 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14433017373085022, "epoch": 2.27, "learning_rate": 1.3735302431896396e-06, "loss": 0.167, "step": 6280, "task_loss": 0.22514958679676056 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20126046240329742, "epoch": 2.27, "learning_rate": 1.3717742460524429e-06, "loss": 0.1887, "step": 6290, "task_loss": 0.6400581002235413 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21604719758033752, "epoch": 2.28, "learning_rate": 1.3700169176699125e-06, "loss": 0.1939, "step": 6300, "task_loss": 0.35505056381225586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1490190625190735, "epoch": 2.28, "learning_rate": 1.3682582643346728e-06, "loss": 0.173, "step": 6310, "task_loss": 0.566085934638977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12886354327201843, "epoch": 2.28, "learning_rate": 1.36649829234409e-06, "loss": 0.1792, "step": 6320, "task_loss": 0.32131779193878174 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14631399512290955, "epoch": 2.29, "learning_rate": 1.3647370080002541e-06, "loss": 0.1629, "step": 6330, "task_loss": 0.3405493497848511 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13580074906349182, "epoch": 2.29, "learning_rate": 1.3629744176099535e-06, "loss": 0.1617, "step": 6340, "task_loss": 0.2829074263572693 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16273626685142517, "epoch": 2.29, "learning_rate": 1.3612105274846538e-06, "loss": 0.174, "step": 6350, "task_loss": 0.34644442796707153 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16554933786392212, "epoch": 2.3, "learning_rate": 1.3594453439404733e-06, "loss": 0.1848, "step": 6360, "task_loss": 0.40179452300071716 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17596763372421265, "epoch": 2.3, "learning_rate": 1.357678873298164e-06, "loss": 0.1648, "step": 6370, "task_loss": 0.5932073593139648 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15840421617031097, "epoch": 2.31, "learning_rate": 1.3559111218830848e-06, "loss": 0.1796, "step": 6380, "task_loss": 0.6633092761039734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16752852499485016, "epoch": 2.31, "learning_rate": 1.3541420960251813e-06, "loss": 0.1728, "step": 6390, "task_loss": 0.6710745096206665 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1526416689157486, "epoch": 2.31, "learning_rate": 1.3523718020589634e-06, "loss": 0.1717, "step": 6400, "task_loss": 0.21283848583698273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14025653898715973, "epoch": 2.32, "learning_rate": 1.3506002463234811e-06, "loss": 0.1718, "step": 6410, "task_loss": 0.22466395795345306 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14743945002555847, "epoch": 2.32, "learning_rate": 1.348827435162302e-06, "loss": 0.1761, "step": 6420, "task_loss": 0.40274888277053833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18202725052833557, "epoch": 2.32, "learning_rate": 1.3470533749234906e-06, "loss": 0.169, "step": 6430, "task_loss": 0.7552968859672546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17869505286216736, "epoch": 2.33, "learning_rate": 1.3452780719595831e-06, "loss": 0.1666, "step": 6440, "task_loss": 0.2956041097640991 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1480785310268402, "epoch": 2.33, "learning_rate": 1.3435015326275654e-06, "loss": 0.1684, "step": 6450, "task_loss": 0.3291912376880646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15362969040870667, "epoch": 2.33, "learning_rate": 1.3417237632888513e-06, "loss": 0.1661, "step": 6460, "task_loss": 0.38771361112594604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1606786698102951, "epoch": 2.34, "learning_rate": 1.3399447703092584e-06, "loss": 0.176, "step": 6470, "task_loss": 0.5447548627853394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14728298783302307, "epoch": 2.34, "learning_rate": 1.3381645600589865e-06, "loss": 0.1682, "step": 6480, "task_loss": 0.4820564091205597 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16725045442581177, "epoch": 2.35, "learning_rate": 1.3363831389125936e-06, "loss": 0.1777, "step": 6490, "task_loss": 0.41131776571273804 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16478011012077332, "epoch": 2.35, "learning_rate": 1.3346005132489739e-06, "loss": 0.1756, "step": 6500, "task_loss": 0.3583562970161438 }, { "epoch": 2.35, "eval_exact_match": 83.61400189214758, "eval_f1": 89.97690778977226, "step": 6500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21100406348705292, "epoch": 2.35, "learning_rate": 1.3328166894513346e-06, "loss": 0.1659, "step": 6510, "task_loss": 0.41332000494003296 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14449000358581543, "epoch": 2.36, "learning_rate": 1.3310316739071738e-06, "loss": 0.1687, "step": 6520, "task_loss": 0.28724896907806396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15244629979133606, "epoch": 2.36, "learning_rate": 1.329245473008256e-06, "loss": 0.1682, "step": 6530, "task_loss": 0.37325456738471985 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1374513804912567, "epoch": 2.36, "learning_rate": 1.3274580931505911e-06, "loss": 0.1838, "step": 6540, "task_loss": 0.47368472814559937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1499250829219818, "epoch": 2.37, "learning_rate": 1.3256695407344103e-06, "loss": 0.1662, "step": 6550, "task_loss": 0.4007405638694763 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17637519538402557, "epoch": 2.37, "learning_rate": 1.3238798221641427e-06, "loss": 0.1872, "step": 6560, "task_loss": 0.4767989218235016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19467362761497498, "epoch": 2.37, "learning_rate": 1.3220889438483944e-06, "loss": 0.168, "step": 6570, "task_loss": 0.5928551554679871 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1555582582950592, "epoch": 2.38, "learning_rate": 1.3202969121999234e-06, "loss": 0.1708, "step": 6580, "task_loss": 0.5098388195037842 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13599567115306854, "epoch": 2.38, "learning_rate": 1.3185037336356182e-06, "loss": 0.1658, "step": 6590, "task_loss": 0.3146175146102905 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1555548906326294, "epoch": 2.39, "learning_rate": 1.316709414576474e-06, "loss": 0.1726, "step": 6600, "task_loss": 0.5938575863838196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12219253927469254, "epoch": 2.39, "learning_rate": 1.3149139614475693e-06, "loss": 0.174, "step": 6610, "task_loss": 0.3089248538017273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1555480659008026, "epoch": 2.39, "learning_rate": 1.3131173806780443e-06, "loss": 0.1676, "step": 6620, "task_loss": 0.42747241258621216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17220956087112427, "epoch": 2.4, "learning_rate": 1.311319678701076e-06, "loss": 0.1887, "step": 6630, "task_loss": 0.4911194443702698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.197723388671875, "epoch": 2.4, "learning_rate": 1.3095208619538574e-06, "loss": 0.1714, "step": 6640, "task_loss": 0.36501121520996094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14832803606987, "epoch": 2.4, "learning_rate": 1.3077209368775724e-06, "loss": 0.1724, "step": 6650, "task_loss": 0.4433749318122864 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17430132627487183, "epoch": 2.41, "learning_rate": 1.3059199099173741e-06, "loss": 0.1818, "step": 6660, "task_loss": 0.5781666040420532 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16755987703800201, "epoch": 2.41, "learning_rate": 1.3041177875223612e-06, "loss": 0.1678, "step": 6670, "task_loss": 0.6399859189987183 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16882233321666718, "epoch": 2.41, "learning_rate": 1.302314576145554e-06, "loss": 0.1698, "step": 6680, "task_loss": 0.6496683359146118 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1583394557237625, "epoch": 2.42, "learning_rate": 1.3005102822438738e-06, "loss": 0.1789, "step": 6690, "task_loss": 0.32157063484191895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13417063653469086, "epoch": 2.42, "learning_rate": 1.2987049122781171e-06, "loss": 0.1802, "step": 6700, "task_loss": 0.2793649435043335 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15821221470832825, "epoch": 2.43, "learning_rate": 1.2968984727129332e-06, "loss": 0.1818, "step": 6710, "task_loss": 0.34716230630874634 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16095976531505585, "epoch": 2.43, "learning_rate": 1.295090970016803e-06, "loss": 0.1825, "step": 6720, "task_loss": 0.19834813475608826 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14987853169441223, "epoch": 2.43, "learning_rate": 1.2932824106620125e-06, "loss": 0.1635, "step": 6730, "task_loss": 0.7775259613990784 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16203686594963074, "epoch": 2.44, "learning_rate": 1.291472801124632e-06, "loss": 0.1696, "step": 6740, "task_loss": 0.39739635586738586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14740723371505737, "epoch": 2.44, "learning_rate": 1.2896621478844931e-06, "loss": 0.1799, "step": 6750, "task_loss": 0.3119643032550812 }, { "epoch": 2.44, "eval_exact_match": 83.70860927152317, "eval_f1": 89.9757275904899, "step": 6750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18803681433200836, "epoch": 2.44, "learning_rate": 1.2878504574251637e-06, "loss": 0.1717, "step": 6760, "task_loss": 0.4526791572570801 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17158998548984528, "epoch": 2.45, "learning_rate": 1.2860377362339257e-06, "loss": 0.1711, "step": 6770, "task_loss": 0.4078561067581177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13310539722442627, "epoch": 2.45, "learning_rate": 1.2842239908017526e-06, "loss": 0.1676, "step": 6780, "task_loss": 0.27124446630477905 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.166331946849823, "epoch": 2.45, "learning_rate": 1.2824092276232853e-06, "loss": 0.1686, "step": 6790, "task_loss": 0.6495641469955444 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16789960861206055, "epoch": 2.46, "learning_rate": 1.280593453196808e-06, "loss": 0.1742, "step": 6800, "task_loss": 0.4059957265853882 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1527012437582016, "epoch": 2.46, "learning_rate": 1.2787766740242277e-06, "loss": 0.1686, "step": 6810, "task_loss": 0.3953001797199249 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16180524230003357, "epoch": 2.46, "learning_rate": 1.2769588966110476e-06, "loss": 0.1789, "step": 6820, "task_loss": 0.49090176820755005 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1586184799671173, "epoch": 2.47, "learning_rate": 1.2751401274663463e-06, "loss": 0.158, "step": 6830, "task_loss": 0.24430197477340698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12404391914606094, "epoch": 2.47, "learning_rate": 1.2733203731027534e-06, "loss": 0.1605, "step": 6840, "task_loss": 0.253868043422699 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17260029911994934, "epoch": 2.48, "learning_rate": 1.2714996400364262e-06, "loss": 0.1541, "step": 6850, "task_loss": 0.4611034691333771 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1461942195892334, "epoch": 2.48, "learning_rate": 1.2696779347870265e-06, "loss": 0.1741, "step": 6860, "task_loss": 0.5523576736450195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12024559080600739, "epoch": 2.48, "learning_rate": 1.2678552638776979e-06, "loss": 0.1755, "step": 6870, "task_loss": 0.3268803060054779 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18247783184051514, "epoch": 2.49, "learning_rate": 1.2660316338350408e-06, "loss": 0.1767, "step": 6880, "task_loss": 0.3315047025680542 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21110492944717407, "epoch": 2.49, "learning_rate": 1.2642070511890905e-06, "loss": 0.1858, "step": 6890, "task_loss": 0.6726840734481812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13523760437965393, "epoch": 2.49, "learning_rate": 1.2623815224732941e-06, "loss": 0.1731, "step": 6900, "task_loss": 0.17781083285808563 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13068142533302307, "epoch": 2.5, "learning_rate": 1.2605550542244854e-06, "loss": 0.1613, "step": 6910, "task_loss": 0.41651782393455505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13178616762161255, "epoch": 2.5, "learning_rate": 1.2587276529828628e-06, "loss": 0.1858, "step": 6920, "task_loss": 0.48381632566452026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1575162410736084, "epoch": 2.5, "learning_rate": 1.2568993252919652e-06, "loss": 0.1753, "step": 6930, "task_loss": 0.27174267172813416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21238577365875244, "epoch": 2.51, "learning_rate": 1.25507007769865e-06, "loss": 0.1898, "step": 6940, "task_loss": 0.3620792031288147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15523602068424225, "epoch": 2.51, "learning_rate": 1.2532399167530674e-06, "loss": 0.1751, "step": 6950, "task_loss": 0.24264416098594666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20205236971378326, "epoch": 2.52, "learning_rate": 1.2514088490086387e-06, "loss": 0.1882, "step": 6960, "task_loss": 0.49486780166625977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1760149896144867, "epoch": 2.52, "learning_rate": 1.2495768810220321e-06, "loss": 0.1753, "step": 6970, "task_loss": 0.5111973881721497 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1396222710609436, "epoch": 2.52, "learning_rate": 1.2477440193531393e-06, "loss": 0.1635, "step": 6980, "task_loss": 0.4071466624736786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21526896953582764, "epoch": 2.53, "learning_rate": 1.2459102705650523e-06, "loss": 0.184, "step": 6990, "task_loss": 0.43278732895851135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15099546313285828, "epoch": 2.53, "learning_rate": 1.24407564122404e-06, "loss": 0.1687, "step": 7000, "task_loss": 0.4598119854927063 }, { "epoch": 2.53, "eval_exact_match": 83.38694418164617, "eval_f1": 89.83912379851094, "step": 7000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13700950145721436, "epoch": 2.53, "learning_rate": 1.2422401378995231e-06, "loss": 0.1684, "step": 7010, "task_loss": 0.23394280672073364 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19724495708942413, "epoch": 2.54, "learning_rate": 1.2404037671640534e-06, "loss": 0.1905, "step": 7020, "task_loss": 0.5948208570480347 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17370860278606415, "epoch": 2.54, "learning_rate": 1.2385665355932874e-06, "loss": 0.1626, "step": 7030, "task_loss": 0.5977954268455505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15155650675296783, "epoch": 2.54, "learning_rate": 1.2367284497659659e-06, "loss": 0.179, "step": 7040, "task_loss": 0.7188633680343628 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.26555588841438293, "epoch": 2.55, "learning_rate": 1.2348895162638862e-06, "loss": 0.1875, "step": 7050, "task_loss": 0.5506167411804199 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1538572609424591, "epoch": 2.55, "learning_rate": 1.2330497416718824e-06, "loss": 0.1718, "step": 7060, "task_loss": 0.6333431601524353 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17727243900299072, "epoch": 2.56, "learning_rate": 1.2312091325778004e-06, "loss": 0.1747, "step": 7070, "task_loss": 0.6085351705551147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1532391905784607, "epoch": 2.56, "learning_rate": 1.229367695572474e-06, "loss": 0.1628, "step": 7080, "task_loss": 0.14348584413528442 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19841280579566956, "epoch": 2.56, "learning_rate": 1.2275254372497012e-06, "loss": 0.1637, "step": 7090, "task_loss": 0.5597392916679382 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1571100950241089, "epoch": 2.57, "learning_rate": 1.225682364206222e-06, "loss": 0.1766, "step": 7100, "task_loss": 0.44082239270210266 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1467963010072708, "epoch": 2.57, "learning_rate": 1.2238384830416926e-06, "loss": 0.1659, "step": 7110, "task_loss": 0.31584417819976807 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1948840469121933, "epoch": 2.57, "learning_rate": 1.2219938003586635e-06, "loss": 0.1776, "step": 7120, "task_loss": 0.32976752519607544 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1948937028646469, "epoch": 2.58, "learning_rate": 1.2201483227625549e-06, "loss": 0.1675, "step": 7130, "task_loss": 0.6085909008979797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1637001782655716, "epoch": 2.58, "learning_rate": 1.2183020568616342e-06, "loss": 0.1846, "step": 7140, "task_loss": 0.47807058691978455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16208887100219727, "epoch": 2.58, "learning_rate": 1.2164550092669906e-06, "loss": 0.177, "step": 7150, "task_loss": 0.4254646599292755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13784141838550568, "epoch": 2.59, "learning_rate": 1.214607186592513e-06, "loss": 0.17, "step": 7160, "task_loss": 0.6249512434005737 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17609910666942596, "epoch": 2.59, "learning_rate": 1.212758595454866e-06, "loss": 0.1824, "step": 7170, "task_loss": 0.6437938213348389 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1495467573404312, "epoch": 2.59, "learning_rate": 1.210909242473464e-06, "loss": 0.1638, "step": 7180, "task_loss": 0.3057920038700104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22128623723983765, "epoch": 2.6, "learning_rate": 1.2090591342704523e-06, "loss": 0.1908, "step": 7190, "task_loss": 0.3862370252609253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2089284509420395, "epoch": 2.6, "learning_rate": 1.2072082774706783e-06, "loss": 0.1803, "step": 7200, "task_loss": 0.3655293583869934 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1751350313425064, "epoch": 2.61, "learning_rate": 1.205356678701671e-06, "loss": 0.1847, "step": 7210, "task_loss": 0.6336475610733032 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14995576441287994, "epoch": 2.61, "learning_rate": 1.2035043445936158e-06, "loss": 0.1643, "step": 7220, "task_loss": 0.5754516124725342 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.191546231508255, "epoch": 2.61, "learning_rate": 1.201651281779331e-06, "loss": 0.1847, "step": 7230, "task_loss": 0.5577281713485718 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17802578210830688, "epoch": 2.62, "learning_rate": 1.1997974968942448e-06, "loss": 0.1745, "step": 7240, "task_loss": 0.45092934370040894 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1552976369857788, "epoch": 2.62, "learning_rate": 1.1979429965763707e-06, "loss": 0.1731, "step": 7250, "task_loss": 0.5546841621398926 }, { "epoch": 2.62, "eval_exact_match": 83.4720908230842, "eval_f1": 89.94530706324616, "step": 7250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12661629915237427, "epoch": 2.62, "learning_rate": 1.1960877874662842e-06, "loss": 0.159, "step": 7260, "task_loss": 0.24371370673179626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17779144644737244, "epoch": 2.63, "learning_rate": 1.1942318762070984e-06, "loss": 0.1738, "step": 7270, "task_loss": 0.29593953490257263 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1766924411058426, "epoch": 2.63, "learning_rate": 1.1923752694444413e-06, "loss": 0.1862, "step": 7280, "task_loss": 0.5446988344192505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13910958170890808, "epoch": 2.63, "learning_rate": 1.1905179738264307e-06, "loss": 0.1742, "step": 7290, "task_loss": 0.2903839349746704 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15694880485534668, "epoch": 2.64, "learning_rate": 1.1886599960036514e-06, "loss": 0.1825, "step": 7300, "task_loss": 0.37020811438560486 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1348649561405182, "epoch": 2.64, "learning_rate": 1.186801342629131e-06, "loss": 0.1704, "step": 7310, "task_loss": 0.31780362129211426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13491874933242798, "epoch": 2.65, "learning_rate": 1.184942020358316e-06, "loss": 0.1796, "step": 7320, "task_loss": 0.2244960069656372 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1295914649963379, "epoch": 2.65, "learning_rate": 1.1830820358490481e-06, "loss": 0.1742, "step": 7330, "task_loss": 0.24627065658569336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1553851217031479, "epoch": 2.65, "learning_rate": 1.1812213957615407e-06, "loss": 0.1855, "step": 7340, "task_loss": 0.3259097635746002 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16126763820648193, "epoch": 2.66, "learning_rate": 1.179360106758354e-06, "loss": 0.1701, "step": 7350, "task_loss": 0.48443925380706787 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16280007362365723, "epoch": 2.66, "learning_rate": 1.1774981755043721e-06, "loss": 0.1779, "step": 7360, "task_loss": 0.5628104209899902 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2028045356273651, "epoch": 2.66, "learning_rate": 1.1756356086667795e-06, "loss": 0.1779, "step": 7370, "task_loss": 0.4160672724246979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13652898371219635, "epoch": 2.67, "learning_rate": 1.1737724129150357e-06, "loss": 0.1713, "step": 7380, "task_loss": 0.46745267510414124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18952497839927673, "epoch": 2.67, "learning_rate": 1.1719085949208525e-06, "loss": 0.1726, "step": 7390, "task_loss": 0.7140260934829712 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1258544772863388, "epoch": 2.67, "learning_rate": 1.1700441613581702e-06, "loss": 0.1553, "step": 7400, "task_loss": 0.26404812932014465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13442295789718628, "epoch": 2.68, "learning_rate": 1.168179118903133e-06, "loss": 0.1707, "step": 7410, "task_loss": 0.4021362066268921 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16747888922691345, "epoch": 2.68, "learning_rate": 1.1663134742340648e-06, "loss": 0.1748, "step": 7420, "task_loss": 0.2707682251930237 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13281439244747162, "epoch": 2.69, "learning_rate": 1.164447234031447e-06, "loss": 0.1679, "step": 7430, "task_loss": 0.4617811143398285 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1636105477809906, "epoch": 2.69, "learning_rate": 1.1625804049778931e-06, "loss": 0.1688, "step": 7440, "task_loss": 0.6173032522201538 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18871071934700012, "epoch": 2.69, "learning_rate": 1.160712993758125e-06, "loss": 0.1944, "step": 7450, "task_loss": 0.4313560724258423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.143826425075531, "epoch": 2.7, "learning_rate": 1.1588450070589492e-06, "loss": 0.1692, "step": 7460, "task_loss": 0.496171236038208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12385374307632446, "epoch": 2.7, "learning_rate": 1.1569764515692334e-06, "loss": 0.1574, "step": 7470, "task_loss": 0.521141529083252 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1483931839466095, "epoch": 2.7, "learning_rate": 1.1551073339798803e-06, "loss": 0.1631, "step": 7480, "task_loss": 0.30746322870254517 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1397826075553894, "epoch": 2.71, "learning_rate": 1.1532376609838079e-06, "loss": 0.1604, "step": 7490, "task_loss": 0.2872992753982544 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18801283836364746, "epoch": 2.71, "learning_rate": 1.151367439275921e-06, "loss": 0.1768, "step": 7500, "task_loss": 0.588277280330658 }, { "epoch": 2.71, "eval_exact_match": 83.66130558183538, "eval_f1": 90.07695887230169, "step": 7500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15823519229888916, "epoch": 2.71, "learning_rate": 1.1494966755530901e-06, "loss": 0.1851, "step": 7510, "task_loss": 0.3757522404193878 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14509174227714539, "epoch": 2.72, "learning_rate": 1.1476253765141267e-06, "loss": 0.1717, "step": 7520, "task_loss": 0.15500980615615845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.147189199924469, "epoch": 2.72, "learning_rate": 1.1457535488597587e-06, "loss": 0.1668, "step": 7530, "task_loss": 0.34534624218940735 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14676904678344727, "epoch": 2.72, "learning_rate": 1.1438811992926067e-06, "loss": 0.1817, "step": 7540, "task_loss": 0.49516406655311584 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.195867121219635, "epoch": 2.73, "learning_rate": 1.1420083345171608e-06, "loss": 0.1738, "step": 7550, "task_loss": 0.6467814445495605 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11699579656124115, "epoch": 2.73, "learning_rate": 1.140134961239755e-06, "loss": 0.1672, "step": 7560, "task_loss": 0.3284699022769928 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14966784417629242, "epoch": 2.74, "learning_rate": 1.1382610861685456e-06, "loss": 0.1768, "step": 7570, "task_loss": 0.33868739008903503 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16117683053016663, "epoch": 2.74, "learning_rate": 1.1363867160134843e-06, "loss": 0.1679, "step": 7580, "task_loss": 0.37554022669792175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1663319170475006, "epoch": 2.74, "learning_rate": 1.1345118574862967e-06, "loss": 0.1682, "step": 7590, "task_loss": 0.5620455741882324 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1660982072353363, "epoch": 2.75, "learning_rate": 1.1326365173004555e-06, "loss": 0.1822, "step": 7600, "task_loss": 0.4505634307861328 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13936099410057068, "epoch": 2.75, "learning_rate": 1.1307607021711606e-06, "loss": 0.1681, "step": 7610, "task_loss": 0.3778277635574341 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17037974298000336, "epoch": 2.75, "learning_rate": 1.12888441881531e-06, "loss": 0.1794, "step": 7620, "task_loss": 0.3082660436630249 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19312995672225952, "epoch": 2.76, "learning_rate": 1.1270076739514805e-06, "loss": 0.1784, "step": 7630, "task_loss": 0.6790364980697632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14956381916999817, "epoch": 2.76, "learning_rate": 1.1251304742998999e-06, "loss": 0.1646, "step": 7640, "task_loss": 0.3310818672180176 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1720355749130249, "epoch": 2.76, "learning_rate": 1.1232528265824252e-06, "loss": 0.1871, "step": 7650, "task_loss": 0.5396093130111694 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12346095591783524, "epoch": 2.77, "learning_rate": 1.1213747375225178e-06, "loss": 0.1782, "step": 7660, "task_loss": 0.5943026542663574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17838293313980103, "epoch": 2.77, "learning_rate": 1.1194962138452194e-06, "loss": 0.1751, "step": 7670, "task_loss": 0.49117571115493774 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15319928526878357, "epoch": 2.78, "learning_rate": 1.1176172622771276e-06, "loss": 0.1701, "step": 7680, "task_loss": 0.2504529356956482 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12942051887512207, "epoch": 2.78, "learning_rate": 1.115737889546373e-06, "loss": 0.1704, "step": 7690, "task_loss": 0.8131101131439209 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17500701546669006, "epoch": 2.78, "learning_rate": 1.1138581023825937e-06, "loss": 0.1713, "step": 7700, "task_loss": 0.3743223249912262 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15439695119857788, "epoch": 2.79, "learning_rate": 1.1119779075169117e-06, "loss": 0.1639, "step": 7710, "task_loss": 0.5529880523681641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15999752283096313, "epoch": 2.79, "learning_rate": 1.1100973116819092e-06, "loss": 0.1556, "step": 7720, "task_loss": 0.3388964533805847 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18036921322345734, "epoch": 2.79, "learning_rate": 1.1082163216116044e-06, "loss": 0.1663, "step": 7730, "task_loss": 0.5201153755187988 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14596882462501526, "epoch": 2.8, "learning_rate": 1.1063349440414265e-06, "loss": 0.1598, "step": 7740, "task_loss": 0.3204638361930847 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1631544828414917, "epoch": 2.8, "learning_rate": 1.1044531857081927e-06, "loss": 0.1804, "step": 7750, "task_loss": 0.8809808492660522 }, { "epoch": 2.8, "eval_exact_match": 83.52885525070955, "eval_f1": 89.94545472443673, "step": 7750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18084211647510529, "epoch": 2.8, "learning_rate": 1.1025710533500838e-06, "loss": 0.1682, "step": 7760, "task_loss": 0.5090347528457642 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1455087661743164, "epoch": 2.81, "learning_rate": 1.1006885537066194e-06, "loss": 0.1893, "step": 7770, "task_loss": 0.32133185863494873 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12667664885520935, "epoch": 2.81, "learning_rate": 1.0988056935186346e-06, "loss": 0.1586, "step": 7780, "task_loss": 0.2582120895385742 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16505160927772522, "epoch": 2.82, "learning_rate": 1.0969224795282556e-06, "loss": 0.1773, "step": 7790, "task_loss": 0.6498008966445923 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12269654124975204, "epoch": 2.82, "learning_rate": 1.0950389184788754e-06, "loss": 0.1786, "step": 7800, "task_loss": 0.7851200103759766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15462878346443176, "epoch": 2.82, "learning_rate": 1.0931550171151295e-06, "loss": 0.1717, "step": 7810, "task_loss": 0.45008930563926697 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17051701247692108, "epoch": 2.83, "learning_rate": 1.0912707821828724e-06, "loss": 0.1914, "step": 7820, "task_loss": 0.40274274349212646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14285723865032196, "epoch": 2.83, "learning_rate": 1.089386220429153e-06, "loss": 0.1757, "step": 7830, "task_loss": 0.513251543045044 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19114567339420319, "epoch": 2.83, "learning_rate": 1.0875013386021893e-06, "loss": 0.1888, "step": 7840, "task_loss": 0.5363246202468872 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1428094506263733, "epoch": 2.84, "learning_rate": 1.0856161434513475e-06, "loss": 0.1713, "step": 7850, "task_loss": 0.4419270157814026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15643715858459473, "epoch": 2.84, "learning_rate": 1.0837306417271147e-06, "loss": 0.1681, "step": 7860, "task_loss": 0.44533056020736694 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13387146592140198, "epoch": 2.84, "learning_rate": 1.0818448401810753e-06, "loss": 0.1737, "step": 7870, "task_loss": 0.4950907230377197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15607303380966187, "epoch": 2.85, "learning_rate": 1.079958745565888e-06, "loss": 0.1749, "step": 7880, "task_loss": 0.6860287189483643 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1685732752084732, "epoch": 2.85, "learning_rate": 1.0780723646352605e-06, "loss": 0.1731, "step": 7890, "task_loss": 0.5070062875747681 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15234628319740295, "epoch": 2.86, "learning_rate": 1.076185704143926e-06, "loss": 0.1662, "step": 7900, "task_loss": 0.32623738050460815 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14870676398277283, "epoch": 2.86, "learning_rate": 1.0742987708476185e-06, "loss": 0.1779, "step": 7910, "task_loss": 0.43853697180747986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11807222664356232, "epoch": 2.86, "learning_rate": 1.0724115715030495e-06, "loss": 0.1545, "step": 7920, "task_loss": 0.3483704626560211 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1611451804637909, "epoch": 2.87, "learning_rate": 1.0705241128678824e-06, "loss": 0.1668, "step": 7930, "task_loss": 0.4193282723426819 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15785019099712372, "epoch": 2.87, "learning_rate": 1.0686364017007093e-06, "loss": 0.1789, "step": 7940, "task_loss": 0.315855473279953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19590085744857788, "epoch": 2.87, "learning_rate": 1.0667484447610261e-06, "loss": 0.1817, "step": 7950, "task_loss": 0.45036888122558594 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1536681354045868, "epoch": 2.88, "learning_rate": 1.0648602488092104e-06, "loss": 0.1783, "step": 7960, "task_loss": 0.39551639556884766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12338414788246155, "epoch": 2.88, "learning_rate": 1.0629718206064935e-06, "loss": 0.151, "step": 7970, "task_loss": 0.35970136523246765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12747308611869812, "epoch": 2.88, "learning_rate": 1.06108316691494e-06, "loss": 0.1743, "step": 7980, "task_loss": 0.3183417320251465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15406057238578796, "epoch": 2.89, "learning_rate": 1.0591942944974212e-06, "loss": 0.1725, "step": 7990, "task_loss": 0.7116891145706177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1521359086036682, "epoch": 2.89, "learning_rate": 1.0573052101175915e-06, "loss": 0.1804, "step": 8000, "task_loss": 0.45792704820632935 }, { "epoch": 2.89, "eval_exact_match": 83.81267738883633, "eval_f1": 90.15074155517108, "step": 8000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19931340217590332, "epoch": 2.89, "learning_rate": 1.0554159205398643e-06, "loss": 0.1716, "step": 8010, "task_loss": 0.7359171509742737 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1555495262145996, "epoch": 2.9, "learning_rate": 1.0535264325293885e-06, "loss": 0.1763, "step": 8020, "task_loss": 0.5659838914871216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18331894278526306, "epoch": 2.9, "learning_rate": 1.0516367528520227e-06, "loss": 0.1768, "step": 8030, "task_loss": 0.6728960871696472 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19219288229942322, "epoch": 2.91, "learning_rate": 1.0497468882743122e-06, "loss": 0.1806, "step": 8040, "task_loss": 0.504513144493103 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12473896145820618, "epoch": 2.91, "learning_rate": 1.0478568455634641e-06, "loss": 0.1709, "step": 8050, "task_loss": 0.22338946163654327 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15949636697769165, "epoch": 2.91, "learning_rate": 1.045966631487324e-06, "loss": 0.1682, "step": 8060, "task_loss": 0.4108890891075134 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16450482606887817, "epoch": 2.92, "learning_rate": 1.0440762528143505e-06, "loss": 0.1658, "step": 8070, "task_loss": 0.38757970929145813 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14186137914657593, "epoch": 2.92, "learning_rate": 1.042185716313592e-06, "loss": 0.1741, "step": 8080, "task_loss": 0.3712804913520813 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16010576486587524, "epoch": 2.92, "learning_rate": 1.040295028754661e-06, "loss": 0.1657, "step": 8090, "task_loss": 0.2653946876525879 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1634744256734848, "epoch": 2.93, "learning_rate": 1.0384041969077125e-06, "loss": 0.1678, "step": 8100, "task_loss": 0.37681901454925537 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17398503422737122, "epoch": 2.93, "learning_rate": 1.0365132275434175e-06, "loss": 0.1694, "step": 8110, "task_loss": 0.5321211814880371 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14301329851150513, "epoch": 2.93, "learning_rate": 1.0346221274329392e-06, "loss": 0.1627, "step": 8120, "task_loss": 0.2358737289905548 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19131416082382202, "epoch": 2.94, "learning_rate": 1.0327309033479087e-06, "loss": 0.1897, "step": 8130, "task_loss": 0.35499224066734314 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1476271152496338, "epoch": 2.94, "learning_rate": 1.0308395620604016e-06, "loss": 0.1682, "step": 8140, "task_loss": 0.42450767755508423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17429782450199127, "epoch": 2.95, "learning_rate": 1.0289481103429135e-06, "loss": 0.1829, "step": 8150, "task_loss": 0.4377296566963196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16124795377254486, "epoch": 2.95, "learning_rate": 1.0270565549683342e-06, "loss": 0.1786, "step": 8160, "task_loss": 0.3876572549343109 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15951263904571533, "epoch": 2.95, "learning_rate": 1.0251649027099262e-06, "loss": 0.1673, "step": 8170, "task_loss": 0.4029073119163513 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1840726137161255, "epoch": 2.96, "learning_rate": 1.0232731603412972e-06, "loss": 0.1927, "step": 8180, "task_loss": 0.4143099784851074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18531106412410736, "epoch": 2.96, "learning_rate": 1.0213813346363792e-06, "loss": 0.1768, "step": 8190, "task_loss": 0.6813019514083862 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18536627292633057, "epoch": 2.96, "learning_rate": 1.0194894323694014e-06, "loss": 0.176, "step": 8200, "task_loss": 0.5788917541503906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1379977911710739, "epoch": 2.97, "learning_rate": 1.0175974603148683e-06, "loss": 0.1811, "step": 8210, "task_loss": 0.8245267271995544 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14147508144378662, "epoch": 2.97, "learning_rate": 1.0157054252475335e-06, "loss": 0.1668, "step": 8220, "task_loss": 0.48868924379348755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17832288146018982, "epoch": 2.97, "learning_rate": 1.0138133339423757e-06, "loss": 0.1748, "step": 8230, "task_loss": 0.48949164152145386 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15996500849723816, "epoch": 2.98, "learning_rate": 1.0119211931745766e-06, "loss": 0.1719, "step": 8240, "task_loss": 0.4312325716018677 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15269511938095093, "epoch": 2.98, "learning_rate": 1.0100290097194932e-06, "loss": 0.1666, "step": 8250, "task_loss": 0.543391227722168 }, { "epoch": 2.98, "eval_exact_match": 83.62346263008514, "eval_f1": 90.03105862224157, "step": 8250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16537584364414215, "epoch": 2.99, "learning_rate": 1.0081367903526367e-06, "loss": 0.1708, "step": 8260, "task_loss": 0.23162506520748138 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1359395980834961, "epoch": 2.99, "learning_rate": 1.0062445418496466e-06, "loss": 0.1698, "step": 8270, "task_loss": 0.4973392188549042 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18227165937423706, "epoch": 2.99, "learning_rate": 1.0043522709862663e-06, "loss": 0.1872, "step": 8280, "task_loss": 0.4104633331298828 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1791878640651703, "epoch": 3.0, "learning_rate": 1.0024599845383195e-06, "loss": 0.1789, "step": 8290, "task_loss": 0.4813977777957916 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, "compression/movement_sparsity/importance_threshold": -Infinity, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15984641015529633, "epoch": 3.0, "learning_rate": 1.0005676892816859e-06, "loss": 0.1717, "step": 8300, "task_loss": 0.5337830781936646 }, { "compression/movement_sparsity/importance_regularization_factor": 8.050780939351022e-05, "compression/movement_sparsity/importance_threshold": -0.8842994279815715, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18141396343708038, "epoch": 3.0, "learning_rate": 9.986753919922763e-07, "loss": 0.1893, "step": 8310, "task_loss": 0.4816855788230896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00018104905245944604, "compression/movement_sparsity/importance_threshold": -0.8829299076613767, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17543792724609375, "epoch": 3.01, "learning_rate": 9.967830994460091e-07, "loss": 0.1674, "step": 8320, "task_loss": 0.7297599911689758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0002814864359787489, "compression/movement_sparsity/importance_threshold": -0.8815618020617008, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14357587695121765, "epoch": 3.01, "learning_rate": 9.94890818418786e-07, "loss": 0.1749, "step": 8330, "task_loss": 0.4249134957790375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00038182001362283225, "compression/movement_sparsity/importance_threshold": -0.8801951104514598, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15991082787513733, "epoch": 3.01, "learning_rate": 9.929985556864669e-07, "loss": 0.162, "step": 8340, "task_loss": 0.3581671714782715 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00048204983906308785, "compression/movement_sparsity/importance_threshold": -0.87882983209957, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16143129765987396, "epoch": 3.02, "learning_rate": 9.911063180248462e-07, "loss": 0.1693, "step": 8350, "task_loss": 0.6364099383354187 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0005821759659709508, "compression/movement_sparsity/importance_threshold": -0.8774659662749472, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14541073143482208, "epoch": 3.02, "learning_rate": 9.89214112209629e-07, "loss": 0.1719, "step": 8360, "task_loss": 0.2846934497356415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0006821984480178128, "compression/movement_sparsity/importance_threshold": -0.8761035122465077, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14429126679897308, "epoch": 3.02, "learning_rate": 9.873219450164061e-07, "loss": 0.1838, "step": 8370, "task_loss": 0.33501607179641724 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0007821173388750946, "compression/movement_sparsity/importance_threshold": -0.8747424692831676, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19137001037597656, "epoch": 3.03, "learning_rate": 9.854298232206296e-07, "loss": 0.1799, "step": 8380, "task_loss": 0.4200936555862427 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0008819326922142095, "compression/movement_sparsity/importance_threshold": -0.8733828366538425, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18857447803020477, "epoch": 3.03, "learning_rate": 9.835377535975905e-07, "loss": 0.1761, "step": 8390, "task_loss": 0.4611113667488098 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0009816445617065568, "compression/movement_sparsity/importance_threshold": -0.872024613627449, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.139084130525589, "epoch": 3.04, "learning_rate": 9.816457429223905e-07, "loss": 0.1636, "step": 8400, "task_loss": 0.5555346012115479 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0010812530010235349, "compression/movement_sparsity/importance_threshold": -0.8706677994729032, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12424302101135254, "epoch": 3.04, "learning_rate": 9.797537979699225e-07, "loss": 0.1682, "step": 8410, "task_loss": 0.3569965064525604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0011807580638365867, "compression/movement_sparsity/importance_threshold": -0.8693123934591207, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13371171057224274, "epoch": 3.04, "learning_rate": 9.778619255148434e-07, "loss": 0.1677, "step": 8420, "task_loss": 0.4002540707588196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.001280159803817118, "compression/movement_sparsity/importance_threshold": -0.8679583948550178, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15741753578186035, "epoch": 3.05, "learning_rate": 9.759701323315496e-07, "loss": 0.1722, "step": 8430, "task_loss": 0.4664173722267151 }, { "compression/movement_sparsity/importance_regularization_factor": 0.001379458274636528, "compression/movement_sparsity/importance_threshold": -0.8666058029295104, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16380411386489868, "epoch": 3.05, "learning_rate": 9.740784251941549e-07, "loss": 0.1842, "step": 8440, "task_loss": 0.3489740490913391 }, { "compression/movement_sparsity/importance_regularization_factor": 0.001478653529966223, "compression/movement_sparsity/importance_threshold": -0.865254616951515, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2031557559967041, "epoch": 3.05, "learning_rate": 9.721868108764637e-07, "loss": 0.1717, "step": 8450, "task_loss": 0.8436130881309509 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0015777456234776306, "compression/movement_sparsity/importance_threshold": -0.8639048361899471, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19330856204032898, "epoch": 3.06, "learning_rate": 9.702952961519502e-07, "loss": 0.1723, "step": 8460, "task_loss": 0.9914184808731079 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0016767346088421425, "compression/movement_sparsity/importance_threshold": -0.8625564599137232, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16919749975204468, "epoch": 3.06, "learning_rate": 9.68403887793729e-07, "loss": 0.1642, "step": 8470, "task_loss": 0.4527369737625122 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0017756205397311798, "compression/movement_sparsity/importance_threshold": -0.8612094873917593, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16218455135822296, "epoch": 3.06, "learning_rate": 9.66512592574536e-07, "loss": 0.1842, "step": 8480, "task_loss": 0.23674465715885162 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0018744034698161484, "compression/movement_sparsity/importance_threshold": -0.8598639178929713, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14123575389385223, "epoch": 3.07, "learning_rate": 9.646214172667018e-07, "loss": 0.1799, "step": 8490, "task_loss": 0.442488431930542 }, { "compression/movement_sparsity/importance_regularization_factor": 0.001973083452768469, "compression/movement_sparsity/importance_threshold": -0.8585197506862754, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13976861536502838, "epoch": 3.07, "learning_rate": 9.627303686421263e-07, "loss": 0.17, "step": 8500, "task_loss": 0.23940631747245789 }, { "epoch": 3.07, "eval_exact_match": 83.62346263008514, "eval_f1": 90.07971472160804, "step": 8500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002071660542259533, "compression/movement_sparsity/importance_threshold": -0.8571769850405877, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15648941695690155, "epoch": 3.08, "learning_rate": 9.608394534722578e-07, "loss": 0.1894, "step": 8510, "task_loss": 0.293619304895401 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002170134791960769, "compression/movement_sparsity/importance_threshold": -0.8558356202248242, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16870902478694916, "epoch": 3.08, "learning_rate": 9.58948678528064e-07, "loss": 0.1768, "step": 8520, "task_loss": 0.6857977509498596 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002268506255543583, "compression/movement_sparsity/importance_threshold": -0.8544956555079009, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14667314291000366, "epoch": 3.08, "learning_rate": 9.570580505800134e-07, "loss": 0.1685, "step": 8530, "task_loss": 0.3974134922027588 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0023667749866793804, "compression/movement_sparsity/importance_threshold": -0.8531570901587339, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2171468436717987, "epoch": 3.09, "learning_rate": 9.551675763980463e-07, "loss": 0.1824, "step": 8540, "task_loss": 0.7033668160438538 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002464941039039561, "compression/movement_sparsity/importance_threshold": -0.8518199234462395, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15260061621665955, "epoch": 3.09, "learning_rate": 9.532772627515527e-07, "loss": 0.1665, "step": 8550, "task_loss": 0.22714334726333618 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0025630044662955597, "compression/movement_sparsity/importance_threshold": -0.8504841546393335, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14580965042114258, "epoch": 3.09, "learning_rate": 9.513871164093483e-07, "loss": 0.1718, "step": 8560, "task_loss": 0.3902367949485779 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002660965322118783, "compression/movement_sparsity/importance_threshold": -0.8491497830069319, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22626420855522156, "epoch": 3.1, "learning_rate": 9.494971441396488e-07, "loss": 0.1679, "step": 8570, "task_loss": 0.6860054731369019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002758823660180637, "compression/movement_sparsity/importance_threshold": -0.8478168078179509, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13325051963329315, "epoch": 3.1, "learning_rate": 9.476073527100477e-07, "loss": 0.1708, "step": 8580, "task_loss": 0.5953207612037659 }, { "compression/movement_sparsity/importance_regularization_factor": 0.002856579534152527, "compression/movement_sparsity/importance_threshold": -0.8464852283413066, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14980527758598328, "epoch": 3.1, "learning_rate": 9.457177488874907e-07, "loss": 0.175, "step": 8590, "task_loss": 0.5516312122344971 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0029542329977058674, "compression/movement_sparsity/importance_threshold": -0.845155043845915, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14336633682250977, "epoch": 3.11, "learning_rate": 9.438283394382505e-07, "loss": 0.1727, "step": 8600, "task_loss": 0.43941259384155273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0030517841045120714, "compression/movement_sparsity/importance_threshold": -0.8438262536006922, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.138885036110878, "epoch": 3.11, "learning_rate": 9.419391311279053e-07, "loss": 0.1608, "step": 8610, "task_loss": 0.4903583824634552 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0031492329082425455, "compression/movement_sparsity/importance_threshold": -0.8424988568745543, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14071306586265564, "epoch": 3.12, "learning_rate": 9.40050130721312e-07, "loss": 0.1671, "step": 8620, "task_loss": 0.4048214554786682 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0032465794625687024, "compression/movement_sparsity/importance_threshold": -0.8411728529364173, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16939450800418854, "epoch": 3.12, "learning_rate": 9.381613449825843e-07, "loss": 0.1777, "step": 8630, "task_loss": 0.5866584777832031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.003343823821161949, "compression/movement_sparsity/importance_threshold": -0.8398482410551974, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11506274342536926, "epoch": 3.12, "learning_rate": 9.362727806750654e-07, "loss": 0.1697, "step": 8640, "task_loss": 0.45650506019592285 }, { "compression/movement_sparsity/importance_regularization_factor": 0.003440966037693698, "compression/movement_sparsity/importance_threshold": -0.8385250204998105, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18618786334991455, "epoch": 3.13, "learning_rate": 9.343844445613072e-07, "loss": 0.1852, "step": 8650, "task_loss": 0.341508150100708 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0035380061658353635, "compression/movement_sparsity/importance_threshold": -0.8372031905391728, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15694350004196167, "epoch": 3.13, "learning_rate": 9.324963434030442e-07, "loss": 0.1587, "step": 8660, "task_loss": 0.6575144529342651 }, { "compression/movement_sparsity/importance_regularization_factor": 0.003634944259258359, "compression/movement_sparsity/importance_threshold": -0.8358827504422001, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1654762327671051, "epoch": 3.13, "learning_rate": 9.306084839611687e-07, "loss": 0.1746, "step": 8670, "task_loss": 0.257068395614624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0037317803716340825, "compression/movement_sparsity/importance_threshold": -0.8345636994778088, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1543157696723938, "epoch": 3.14, "learning_rate": 9.287208729957085e-07, "loss": 0.1791, "step": 8680, "task_loss": 0.6083182096481323 }, { "compression/movement_sparsity/importance_regularization_factor": 0.003828514556633934, "compression/movement_sparsity/importance_threshold": -0.8332460369149152, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15461325645446777, "epoch": 3.14, "learning_rate": 9.268335172658008e-07, "loss": 0.1743, "step": 8690, "task_loss": 0.3445891737937927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.003925146867929356, "compression/movement_sparsity/importance_threshold": -0.8319297620224346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14635170996189117, "epoch": 3.14, "learning_rate": 9.249464235296695e-07, "loss": 0.1698, "step": 8700, "task_loss": 0.3346456289291382 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004021677359191753, "compression/movement_sparsity/importance_threshold": -0.8306148740692835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17073814570903778, "epoch": 3.15, "learning_rate": 9.230595985446003e-07, "loss": 0.1737, "step": 8710, "task_loss": 0.5876675844192505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004118106084092519, "compression/movement_sparsity/importance_threshold": -0.8293013723243781, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20534959435462952, "epoch": 3.15, "learning_rate": 9.21173049066916e-07, "loss": 0.1693, "step": 8720, "task_loss": 0.382878839969635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00421443309630308, "compression/movement_sparsity/importance_threshold": -0.827989256056634, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12774941325187683, "epoch": 3.16, "learning_rate": 9.192867818519535e-07, "loss": 0.1775, "step": 8730, "task_loss": 0.19287356734275818 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0043106584494948365, "compression/movement_sparsity/importance_threshold": -0.8266785245349677, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1298234462738037, "epoch": 3.16, "learning_rate": 9.174008036540384e-07, "loss": 0.1682, "step": 8740, "task_loss": 0.2842978239059448 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004406782197339201, "compression/movement_sparsity/importance_threshold": -0.8253691770282953, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1581944227218628, "epoch": 3.16, "learning_rate": 9.155151212264625e-07, "loss": 0.1603, "step": 8750, "task_loss": 0.26196685433387756 }, { "epoch": 3.16, "eval_exact_match": 83.49101229895932, "eval_f1": 89.91894198656578, "step": 8750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004502804393507587, "compression/movement_sparsity/importance_threshold": -0.8240612128055325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13445620238780975, "epoch": 3.17, "learning_rate": 9.136297413214566e-07, "loss": 0.1729, "step": 8760, "task_loss": 0.1746811419725418 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004598725091671408, "compression/movement_sparsity/importance_threshold": -0.8227546311355955, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12342646718025208, "epoch": 3.17, "learning_rate": 9.1174467069017e-07, "loss": 0.1721, "step": 8770, "task_loss": 0.33239758014678955 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004694544345502063, "compression/movement_sparsity/importance_threshold": -0.8214494312874007, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15459150075912476, "epoch": 3.17, "learning_rate": 9.098599160826441e-07, "loss": 0.1722, "step": 8780, "task_loss": 0.3906312584877014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004790262208670973, "compression/movement_sparsity/importance_threshold": -0.8201456125298637, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16902866959571838, "epoch": 3.18, "learning_rate": 9.079754842477879e-07, "loss": 0.1756, "step": 8790, "task_loss": 0.3516932725906372 }, { "compression/movement_sparsity/importance_regularization_factor": 0.004885878734849544, "compression/movement_sparsity/importance_threshold": -0.8188431741319009, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1674996316432953, "epoch": 3.18, "learning_rate": 9.060913819333559e-07, "loss": 0.1744, "step": 8800, "task_loss": 0.5219091176986694 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0049813939777091885, "compression/movement_sparsity/importance_threshold": -0.8175421153624282, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14921870827674866, "epoch": 3.18, "learning_rate": 9.042076158859214e-07, "loss": 0.1648, "step": 8810, "task_loss": 0.5738848447799683 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005076807990921315, "compression/movement_sparsity/importance_threshold": -0.8162424354903617, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14361397922039032, "epoch": 3.19, "learning_rate": 9.023241928508542e-07, "loss": 0.1649, "step": 8820, "task_loss": 0.5719636678695679 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005172120828157334, "compression/movement_sparsity/importance_threshold": -0.8149441337846175, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1588568091392517, "epoch": 3.19, "learning_rate": 9.004411195722965e-07, "loss": 0.1811, "step": 8830, "task_loss": 0.39752593636512756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00526733254308864, "compression/movement_sparsity/importance_threshold": -0.8136472095141118, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2179379165172577, "epoch": 3.19, "learning_rate": 8.985584027931364e-07, "loss": 0.1771, "step": 8840, "task_loss": 0.455571711063385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00536244318938668, "compression/movement_sparsity/importance_threshold": -0.8123516619477603, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15764448046684265, "epoch": 3.2, "learning_rate": 8.966760492549872e-07, "loss": 0.1683, "step": 8850, "task_loss": 0.5573818683624268 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005457452820722847, "compression/movement_sparsity/importance_threshold": -0.8110574903544794, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15336552262306213, "epoch": 3.2, "learning_rate": 8.947940656981603e-07, "loss": 0.1729, "step": 8860, "task_loss": 0.30682915449142456 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005552361490768549, "compression/movement_sparsity/importance_threshold": -0.8097646940031848, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15846428275108337, "epoch": 3.21, "learning_rate": 8.929124588616429e-07, "loss": 0.1706, "step": 8870, "task_loss": 0.5824536681175232 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005647169253195196, "compression/movement_sparsity/importance_threshold": -0.8084732721627931, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13658425211906433, "epoch": 3.21, "learning_rate": 8.910312354830736e-07, "loss": 0.1763, "step": 8880, "task_loss": 0.7418199777603149 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0057418761616742035, "compression/movement_sparsity/importance_threshold": -0.8071832241022199, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21933336555957794, "epoch": 3.21, "learning_rate": 8.891504022987165e-07, "loss": 0.1868, "step": 8890, "task_loss": 0.48175936937332153 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005836482269876977, "compression/movement_sparsity/importance_threshold": -0.8058945490903814, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12823690474033356, "epoch": 3.22, "learning_rate": 8.8726996604344e-07, "loss": 0.1711, "step": 8900, "task_loss": 0.35165947675704956 }, { "compression/movement_sparsity/importance_regularization_factor": 0.005930987631474931, "compression/movement_sparsity/importance_threshold": -0.8046072463961937, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1561659872531891, "epoch": 3.22, "learning_rate": 8.853899334506904e-07, "loss": 0.1773, "step": 8910, "task_loss": 0.23708105087280273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00602539230013947, "compression/movement_sparsity/importance_threshold": -0.803321315288573, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1587158590555191, "epoch": 3.22, "learning_rate": 8.835103112524691e-07, "loss": 0.1701, "step": 8920, "task_loss": 0.3867292106151581 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006119696329542016, "compression/movement_sparsity/importance_threshold": -0.8020367550364351, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13717815279960632, "epoch": 3.23, "learning_rate": 8.816311061793068e-07, "loss": 0.1711, "step": 8930, "task_loss": 0.31392791867256165 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006213899773353968, "compression/movement_sparsity/importance_threshold": -0.8007535649086963, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16188368201255798, "epoch": 3.23, "learning_rate": 8.79752324960242e-07, "loss": 0.1637, "step": 8940, "task_loss": 0.5714429616928101 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006308002685246738, "compression/movement_sparsity/importance_threshold": -0.7994717441742726, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1368107795715332, "epoch": 3.23, "learning_rate": 8.778739743227951e-07, "loss": 0.179, "step": 8950, "task_loss": 0.36976689100265503 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006402005118891741, "compression/movement_sparsity/importance_threshold": -0.7981912921020801, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1518588364124298, "epoch": 3.24, "learning_rate": 8.759960609929435e-07, "loss": 0.1639, "step": 8960, "task_loss": 0.6866006255149841 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00649590712796039, "compression/movement_sparsity/importance_threshold": -0.7969122079610346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20307350158691406, "epoch": 3.24, "learning_rate": 8.741185916951006e-07, "loss": 0.1676, "step": 8970, "task_loss": 0.28236985206604004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006589708766124069, "compression/movement_sparsity/importance_threshold": -0.7956344910200528, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16698408126831055, "epoch": 3.25, "learning_rate": 8.72241573152088e-07, "loss": 0.1652, "step": 8980, "task_loss": 0.30868786573410034 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006683410087054235, "compression/movement_sparsity/importance_threshold": -0.79435814054805, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15856650471687317, "epoch": 3.25, "learning_rate": 8.703650120851146e-07, "loss": 0.1839, "step": 8990, "task_loss": 0.16748939454555511 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006777011144422273, "compression/movement_sparsity/importance_threshold": -0.7930831558139426, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15102289617061615, "epoch": 3.25, "learning_rate": 8.684889152137508e-07, "loss": 0.1715, "step": 9000, "task_loss": 0.4023663103580475 }, { "epoch": 3.25, "eval_exact_match": 83.50047303689688, "eval_f1": 89.90574516831549, "step": 9000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.006870511991899589, "compression/movement_sparsity/importance_threshold": -0.7918095360866468, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12239585816860199, "epoch": 3.26, "learning_rate": 8.66613289255904e-07, "loss": 0.1694, "step": 9010, "task_loss": 0.3852020502090454 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00696391268315761, "compression/movement_sparsity/importance_threshold": -0.7905372806350783, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17271217703819275, "epoch": 3.26, "learning_rate": 8.647381409277966e-07, "loss": 0.1749, "step": 9020, "task_loss": 0.48539960384368896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007057213271867729, "compression/movement_sparsity/importance_threshold": -0.7892663887281537, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17906677722930908, "epoch": 3.26, "learning_rate": 8.628634769439398e-07, "loss": 0.172, "step": 9030, "task_loss": 0.44566771388053894 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007150413811701367, "compression/movement_sparsity/importance_threshold": -0.7879968596347886, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18354880809783936, "epoch": 3.27, "learning_rate": 8.609893040171112e-07, "loss": 0.1751, "step": 9040, "task_loss": 0.3899494707584381 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0072435143563299356, "compression/movement_sparsity/importance_threshold": -0.7867286926238993, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17823264002799988, "epoch": 3.27, "learning_rate": 8.59115628858329e-07, "loss": 0.1753, "step": 9050, "task_loss": 0.3264240622520447 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007336514959424835, "compression/movement_sparsity/importance_threshold": -0.7854618869644019, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16943280398845673, "epoch": 3.27, "learning_rate": 8.5724245817683e-07, "loss": 0.1781, "step": 9060, "task_loss": 0.31149184703826904 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007429415674657485, "compression/movement_sparsity/importance_threshold": -0.7841964419252122, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13426220417022705, "epoch": 3.28, "learning_rate": 8.553697986800444e-07, "loss": 0.1806, "step": 9070, "task_loss": 0.3615609109401703 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007522216555699301, "compression/movement_sparsity/importance_threshold": -0.7829323567752465, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14723095297813416, "epoch": 3.28, "learning_rate": 8.534976570735711e-07, "loss": 0.1751, "step": 9080, "task_loss": 0.2963823676109314 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007614917656221679, "compression/movement_sparsity/importance_threshold": -0.7816696307834208, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1296226531267166, "epoch": 3.29, "learning_rate": 8.516260400611559e-07, "loss": 0.1796, "step": 9090, "task_loss": 0.4440094828605652 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007707519029896036, "compression/movement_sparsity/importance_threshold": -0.7804082632186513, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1356373429298401, "epoch": 3.29, "learning_rate": 8.497549543446649e-07, "loss": 0.1669, "step": 9100, "task_loss": 0.3411458134651184 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0078000207303937814, "compression/movement_sparsity/importance_threshold": -0.7791482533498539, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15570493042469025, "epoch": 3.29, "learning_rate": 8.478844066240624e-07, "loss": 0.1928, "step": 9110, "task_loss": 0.35819387435913086 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007892422811386331, "compression/movement_sparsity/importance_threshold": -0.7778896004459447, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16601881384849548, "epoch": 3.3, "learning_rate": 8.460144035973866e-07, "loss": 0.1692, "step": 9120, "task_loss": 0.5100635290145874 }, { "compression/movement_sparsity/importance_regularization_factor": 0.007984725326545077, "compression/movement_sparsity/importance_threshold": -0.7766323037758399, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17528662085533142, "epoch": 3.3, "learning_rate": 8.44144951960724e-07, "loss": 0.1646, "step": 9130, "task_loss": 0.4419068396091461 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00807692832954146, "compression/movement_sparsity/importance_threshold": -0.7753763626084553, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13393300771713257, "epoch": 3.3, "learning_rate": 8.422760584081881e-07, "loss": 0.1774, "step": 9140, "task_loss": 0.16897660493850708 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008169031874046875, "compression/movement_sparsity/importance_threshold": -0.7741217762127072, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13365423679351807, "epoch": 3.31, "learning_rate": 8.404077296318928e-07, "loss": 0.1632, "step": 9150, "task_loss": 0.30446919798851013 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008261036013732738, "compression/movement_sparsity/importance_threshold": -0.7728685438575115, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2315337210893631, "epoch": 3.31, "learning_rate": 8.385399723219313e-07, "loss": 0.191, "step": 9160, "task_loss": 0.4761585593223572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008352940802270453, "compression/movement_sparsity/importance_threshold": -0.7716166648117844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16882847249507904, "epoch": 3.31, "learning_rate": 8.366727931663481e-07, "loss": 0.1695, "step": 9170, "task_loss": 0.2730327248573303 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008444746293331428, "compression/movement_sparsity/importance_threshold": -0.7703661383444418, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1654556542634964, "epoch": 3.32, "learning_rate": 8.348061988511194e-07, "loss": 0.1676, "step": 9180, "task_loss": 0.7155327796936035 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00853645254058708, "compression/movement_sparsity/importance_threshold": -0.7691169637244, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20778971910476685, "epoch": 3.32, "learning_rate": 8.329401960601273e-07, "loss": 0.1829, "step": 9190, "task_loss": 0.5636166930198669 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008628059597708822, "compression/movement_sparsity/importance_threshold": -0.7678691402205748, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16762343049049377, "epoch": 3.32, "learning_rate": 8.310747914751339e-07, "loss": 0.1736, "step": 9200, "task_loss": 0.32610780000686646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008719567518368061, "compression/movement_sparsity/importance_threshold": -0.7666226671018825, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1574648916721344, "epoch": 3.33, "learning_rate": 8.292099917757612e-07, "loss": 0.1756, "step": 9210, "task_loss": 0.43395519256591797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0088109763562362, "compression/movement_sparsity/importance_threshold": -0.7653775436372391, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15642675757408142, "epoch": 3.33, "learning_rate": 8.273458036394641e-07, "loss": 0.1819, "step": 9220, "task_loss": 0.3005210757255554 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008902286164984664, "compression/movement_sparsity/importance_threshold": -0.7641337690955606, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15306097269058228, "epoch": 3.34, "learning_rate": 8.254822337415079e-07, "loss": 0.1736, "step": 9230, "task_loss": 0.3201584815979004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.008993496998284847, "compression/movement_sparsity/importance_threshold": -0.7628913427457633, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11628354340791702, "epoch": 3.34, "learning_rate": 8.23619288754945e-07, "loss": 0.1631, "step": 9240, "task_loss": 0.29428863525390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009084608909808174, "compression/movement_sparsity/importance_threshold": -0.7616502638567629, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13086993992328644, "epoch": 3.34, "learning_rate": 8.217569753505883e-07, "loss": 0.1668, "step": 9250, "task_loss": 0.3168758749961853 }, { "epoch": 3.34, "eval_exact_match": 83.52885525070955, "eval_f1": 89.96188168895662, "step": 9250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009175621953226057, "compression/movement_sparsity/importance_threshold": -0.7604105316974756, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1549527943134308, "epoch": 3.35, "learning_rate": 8.198953001969908e-07, "loss": 0.1774, "step": 9260, "task_loss": 0.35443389415740967 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009266536182209873, "compression/movement_sparsity/importance_threshold": -0.7591721455368179, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11498824506998062, "epoch": 3.35, "learning_rate": 8.180342699604192e-07, "loss": 0.1719, "step": 9270, "task_loss": 0.29143670201301575 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009357351650431077, "compression/movement_sparsity/importance_threshold": -0.7579351046437052, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1407342255115509, "epoch": 3.35, "learning_rate": 8.161738913048309e-07, "loss": 0.1843, "step": 9280, "task_loss": 0.46949753165245056 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009448068411561065, "compression/movement_sparsity/importance_threshold": -0.7566994082870538, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1577707827091217, "epoch": 3.36, "learning_rate": 8.145001131224242e-07, "loss": 0.1688, "step": 9290, "task_loss": 0.2751985192298889 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009538686519271247, "compression/movement_sparsity/importance_threshold": -0.7554650557357798, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14712263643741608, "epoch": 3.36, "learning_rate": 8.126409908215325e-07, "loss": 0.1608, "step": 9300, "task_loss": 0.28362199664115906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.00962920602723303, "compression/movement_sparsity/importance_threshold": -0.7542320462587994, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17634797096252441, "epoch": 3.36, "learning_rate": 8.107825394138224e-07, "loss": 0.1866, "step": 9310, "task_loss": 0.7741411924362183 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009719626989117821, "compression/movement_sparsity/importance_threshold": -0.7530003791250285, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1491977721452713, "epoch": 3.37, "learning_rate": 8.089247655540163e-07, "loss": 0.175, "step": 9320, "task_loss": 0.3922104239463806 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009809949458597039, "compression/movement_sparsity/importance_threshold": -0.7517700536033832, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16116444766521454, "epoch": 3.37, "learning_rate": 8.070676758944122e-07, "loss": 0.1655, "step": 9330, "task_loss": 0.5367953777313232 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009900173489342093, "compression/movement_sparsity/importance_threshold": -0.7505410689627796, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1759243607521057, "epoch": 3.38, "learning_rate": 8.052112770848568e-07, "loss": 0.1672, "step": 9340, "task_loss": 0.2593398094177246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.009990299135024385, "compression/movement_sparsity/importance_threshold": -0.7493134244721338, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17565186321735382, "epoch": 3.38, "learning_rate": 8.033555757727237e-07, "loss": 0.1661, "step": 9350, "task_loss": 0.4808293581008911 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01008032644931534, "compression/movement_sparsity/importance_threshold": -0.7480871194003618, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18272629380226135, "epoch": 3.38, "learning_rate": 8.015005786028893e-07, "loss": 0.1827, "step": 9360, "task_loss": 0.47488927841186523 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010170255485886355, "compression/movement_sparsity/importance_threshold": -0.7468621530163796, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17263399064540863, "epoch": 3.39, "learning_rate": 7.996462922177072e-07, "loss": 0.1621, "step": 9370, "task_loss": 0.4343196749687195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010260086298408851, "compression/movement_sparsity/importance_threshold": -0.7456385245891034, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16438047587871552, "epoch": 3.39, "learning_rate": 7.977927232569877e-07, "loss": 0.1699, "step": 9380, "task_loss": 0.4834184944629669 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010349818940554234, "compression/movement_sparsity/importance_threshold": -0.7444162333874491, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19111394882202148, "epoch": 3.39, "learning_rate": 7.959398783579698e-07, "loss": 0.1637, "step": 9390, "task_loss": 0.6939228177070618 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01043945346599391, "compression/movement_sparsity/importance_threshold": -0.7431952786803331, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1576661765575409, "epoch": 3.4, "learning_rate": 7.940877641553021e-07, "loss": 0.1579, "step": 9400, "task_loss": 0.4254157543182373 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0105289899283993, "compression/movement_sparsity/importance_threshold": -0.7419756597366711, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13320744037628174, "epoch": 3.4, "learning_rate": 7.922363872810159e-07, "loss": 0.1813, "step": 9410, "task_loss": 0.3047106862068176 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010618428381441787, "compression/movement_sparsity/importance_threshold": -0.7407573758253796, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16407856345176697, "epoch": 3.4, "learning_rate": 7.903857543645014e-07, "loss": 0.1708, "step": 9420, "task_loss": 0.44201624393463135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010707768878792823, "compression/movement_sparsity/importance_threshold": -0.7395404262153742, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16521432995796204, "epoch": 3.41, "learning_rate": 7.885358720324865e-07, "loss": 0.1759, "step": 9430, "task_loss": 0.3065508008003235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010797011474123798, "compression/movement_sparsity/importance_threshold": -0.7383248101755712, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1531199812889099, "epoch": 3.41, "learning_rate": 7.866867469090096e-07, "loss": 0.1814, "step": 9440, "task_loss": 0.6985189318656921 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010886156221106119, "compression/movement_sparsity/importance_threshold": -0.7371105269748865, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14802789688110352, "epoch": 3.42, "learning_rate": 7.848383856153991e-07, "loss": 0.1597, "step": 9450, "task_loss": 0.40684616565704346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.010975203173411207, "compression/movement_sparsity/importance_threshold": -0.7358975758822364, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1340632140636444, "epoch": 3.42, "learning_rate": 7.829907947702478e-07, "loss": 0.1784, "step": 9460, "task_loss": 0.5496231317520142 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011064152384710467, "compression/movement_sparsity/importance_threshold": -0.7346859561665366, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12589387595653534, "epoch": 3.42, "learning_rate": 7.811439809893896e-07, "loss": 0.162, "step": 9470, "task_loss": 0.3624046742916107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011153003908675308, "compression/movement_sparsity/importance_threshold": -0.7334756670967038, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1687452495098114, "epoch": 3.43, "learning_rate": 7.792979508858765e-07, "loss": 0.1767, "step": 9480, "task_loss": 0.4035765826702118 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01124175779897714, "compression/movement_sparsity/importance_threshold": -0.7322667079416534, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16020125150680542, "epoch": 3.43, "learning_rate": 7.774527110699527e-07, "loss": 0.1603, "step": 9490, "task_loss": 0.41458970308303833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011330414109287378, "compression/movement_sparsity/importance_threshold": -0.7310590779703019, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.132632777094841, "epoch": 3.43, "learning_rate": 7.756082681490345e-07, "loss": 0.1682, "step": 9500, "task_loss": 0.3918096423149109 }, { "epoch": 3.43, "eval_exact_match": 83.57615894039735, "eval_f1": 89.99568658968761, "step": 9500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011418972893277427, "compression/movement_sparsity/importance_threshold": -0.7298527764515652, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14293313026428223, "epoch": 3.44, "learning_rate": 7.737646287276834e-07, "loss": 0.1846, "step": 9510, "task_loss": 0.34345972537994385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011507434204618703, "compression/movement_sparsity/importance_threshold": -0.7286478026543594, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.158808171749115, "epoch": 3.44, "learning_rate": 7.719217994075842e-07, "loss": 0.1825, "step": 9520, "task_loss": 0.3061710596084595 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011595798096982617, "compression/movement_sparsity/importance_threshold": -0.7274441558476004, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1777794063091278, "epoch": 3.44, "learning_rate": 7.700797867875215e-07, "loss": 0.1779, "step": 9530, "task_loss": 0.43014687299728394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011684064624040575, "compression/movement_sparsity/importance_threshold": -0.7262418353002045, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14890308678150177, "epoch": 3.45, "learning_rate": 7.682385974633539e-07, "loss": 0.1667, "step": 9540, "task_loss": 0.4941771626472473 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011772233839463965, "compression/movement_sparsity/importance_threshold": -0.7250408402810881, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13499748706817627, "epoch": 3.45, "learning_rate": 7.663982380279936e-07, "loss": 0.1855, "step": 9550, "task_loss": 0.5359944105148315 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011860305796924247, "compression/movement_sparsity/importance_threshold": -0.7238411700591665, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17753419280052185, "epoch": 3.46, "learning_rate": 7.645587150713797e-07, "loss": 0.1758, "step": 9560, "task_loss": 0.2920956611633301 }, { "compression/movement_sparsity/importance_regularization_factor": 0.011948280550092807, "compression/movement_sparsity/importance_threshold": -0.722642823903356, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1481466293334961, "epoch": 3.46, "learning_rate": 7.627200351804573e-07, "loss": 0.167, "step": 9570, "task_loss": 0.4186212122440338 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012036158152641053, "compression/movement_sparsity/importance_threshold": -0.721445801082573, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1611461639404297, "epoch": 3.46, "learning_rate": 7.608822049391522e-07, "loss": 0.1624, "step": 9580, "task_loss": 0.5867197513580322 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012123938658240395, "compression/movement_sparsity/importance_threshold": -0.7202501008657333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17020216584205627, "epoch": 3.47, "learning_rate": 7.590452309283473e-07, "loss": 0.1777, "step": 9590, "task_loss": 0.5484004020690918 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012211622120562255, "compression/movement_sparsity/importance_threshold": -0.7190557225217529, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16823932528495789, "epoch": 3.47, "learning_rate": 7.572091197258605e-07, "loss": 0.1712, "step": 9600, "task_loss": 0.42677053809165955 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012299208593278026, "compression/movement_sparsity/importance_threshold": -0.7178626653195481, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1810576617717743, "epoch": 3.47, "learning_rate": 7.553738779064191e-07, "loss": 0.1867, "step": 9610, "task_loss": 0.6101388931274414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012386698130059135, "compression/movement_sparsity/importance_threshold": -0.7166709285280347, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15320083498954773, "epoch": 3.48, "learning_rate": 7.53539512041638e-07, "loss": 0.1972, "step": 9620, "task_loss": 0.45958250761032104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012474090784576982, "compression/movement_sparsity/importance_threshold": -0.7154805114161291, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17546682059764862, "epoch": 3.48, "learning_rate": 7.51706028699995e-07, "loss": 0.1772, "step": 9630, "task_loss": 0.44312983751296997 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012561386610502979, "compression/movement_sparsity/importance_threshold": -0.7142914132527471, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1869005411863327, "epoch": 3.48, "learning_rate": 7.498734344468085e-07, "loss": 0.1721, "step": 9640, "task_loss": 0.4443809986114502 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012648585661508538, "compression/movement_sparsity/importance_threshold": -0.713103633306805, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21304626762866974, "epoch": 3.49, "learning_rate": 7.480417358442131e-07, "loss": 0.1735, "step": 9650, "task_loss": 0.4839943051338196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012735687991265069, "compression/movement_sparsity/importance_threshold": -0.7119171708472186, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1490074098110199, "epoch": 3.49, "learning_rate": 7.462109394511352e-07, "loss": 0.1827, "step": 9660, "task_loss": 0.6885297298431396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01282269365344399, "compression/movement_sparsity/importance_threshold": -0.7107320251429041, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14718842506408691, "epoch": 3.49, "learning_rate": 7.443810518232723e-07, "loss": 0.1786, "step": 9670, "task_loss": 0.34075993299484253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012909602701716701, "compression/movement_sparsity/importance_threshold": -0.7095481954627776, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2005831003189087, "epoch": 3.5, "learning_rate": 7.425520795130658e-07, "loss": 0.1759, "step": 9680, "task_loss": 0.5733194351196289 }, { "compression/movement_sparsity/importance_regularization_factor": 0.012996415189754616, "compression/movement_sparsity/importance_threshold": -0.7083656810757551, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13851837813854218, "epoch": 3.5, "learning_rate": 7.407240290696813e-07, "loss": 0.1682, "step": 9690, "task_loss": 0.24197955429553986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013083131171229131, "compression/movement_sparsity/importance_threshold": -0.707184481250753, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15892581641674042, "epoch": 3.51, "learning_rate": 7.388969070389828e-07, "loss": 0.1781, "step": 9700, "task_loss": 0.3478658199310303 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013169750699811683, "compression/movement_sparsity/importance_threshold": -0.7060045952566868, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13390114903450012, "epoch": 3.51, "learning_rate": 7.370707199635094e-07, "loss": 0.1746, "step": 9710, "task_loss": 0.46922826766967773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01325627382917368, "compression/movement_sparsity/importance_threshold": -0.7048260223624727, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14467637240886688, "epoch": 3.51, "learning_rate": 7.352454743824531e-07, "loss": 0.16, "step": 9720, "task_loss": 0.2225886881351471 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01334270061298652, "compression/movement_sparsity/importance_threshold": -0.7036487618370271, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14436565339565277, "epoch": 3.52, "learning_rate": 7.334211768316338e-07, "loss": 0.1733, "step": 9730, "task_loss": 0.7009656429290771 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013429031104921623, "compression/movement_sparsity/importance_threshold": -0.7024728129492657, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1612703502178192, "epoch": 3.52, "learning_rate": 7.315978338434773e-07, "loss": 0.1794, "step": 9740, "task_loss": 0.45459383726119995 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013515265358650386, "compression/movement_sparsity/importance_threshold": -0.7012981749681048, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1257527470588684, "epoch": 3.52, "learning_rate": 7.297754519469909e-07, "loss": 0.1659, "step": 9750, "task_loss": 0.6451063752174377 }, { "epoch": 3.52, "eval_exact_match": 83.50993377483444, "eval_f1": 89.97423247909468, "step": 9750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013601403427844234, "compression/movement_sparsity/importance_threshold": -0.7001248471624604, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1723412126302719, "epoch": 3.53, "learning_rate": 7.279540376677407e-07, "loss": 0.1644, "step": 9760, "task_loss": 0.5570217370986938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01368744536617457, "compression/movement_sparsity/importance_threshold": -0.6989528288012485, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1490807831287384, "epoch": 3.53, "learning_rate": 7.263155975197417e-07, "loss": 0.1819, "step": 9770, "task_loss": 0.36897680163383484 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0137733912273128, "compression/movement_sparsity/importance_threshold": -0.6977821191533853, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15225452184677124, "epoch": 3.53, "learning_rate": 7.244960396787469e-07, "loss": 0.1688, "step": 9780, "task_loss": 0.27469679713249207 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013859241064930345, "compression/movement_sparsity/importance_threshold": -0.6966127174877867, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.161770761013031, "epoch": 3.54, "learning_rate": 7.226774683594532e-07, "loss": 0.1737, "step": 9790, "task_loss": 0.562671422958374 }, { "compression/movement_sparsity/importance_regularization_factor": 0.013944994932698615, "compression/movement_sparsity/importance_threshold": -0.695444623073369, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1080978512763977, "epoch": 3.54, "learning_rate": 7.208598900737806e-07, "loss": 0.1776, "step": 9800, "task_loss": 0.15812508761882782 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014030652884289011, "compression/movement_sparsity/importance_threshold": -0.6942778351790481, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12826144695281982, "epoch": 3.55, "learning_rate": 7.190433113300952e-07, "loss": 0.1663, "step": 9810, "task_loss": 0.3542582392692566 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014116214973372957, "compression/movement_sparsity/importance_threshold": -0.69311235307374, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16906192898750305, "epoch": 3.55, "learning_rate": 7.172277386331832e-07, "loss": 0.1832, "step": 9820, "task_loss": 0.47162115573883057 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01420168125362185, "compression/movement_sparsity/importance_threshold": -0.6919481760263608, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16506068408489227, "epoch": 3.55, "learning_rate": 7.154131784842279e-07, "loss": 0.1837, "step": 9830, "task_loss": 0.5075066089630127 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014287051778707111, "compression/movement_sparsity/importance_threshold": -0.6907853033058268, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18274077773094177, "epoch": 3.56, "learning_rate": 7.13599637380788e-07, "loss": 0.1698, "step": 9840, "task_loss": 0.5426364541053772 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014372326602300119, "compression/movement_sparsity/importance_threshold": -0.6896237341810542, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1831640601158142, "epoch": 3.56, "learning_rate": 7.117871218167716e-07, "loss": 0.1616, "step": 9850, "task_loss": 0.7144515514373779 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014457505778072335, "compression/movement_sparsity/importance_threshold": -0.6884634679209584, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1759902834892273, "epoch": 3.56, "learning_rate": 7.09975638282416e-07, "loss": 0.1769, "step": 9860, "task_loss": 0.35797786712646484 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014542589359695154, "compression/movement_sparsity/importance_threshold": -0.6873045037944556, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1537725031375885, "epoch": 3.57, "learning_rate": 7.081651932642628e-07, "loss": 0.1887, "step": 9870, "task_loss": 0.23532593250274658 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014627577400839973, "compression/movement_sparsity/importance_threshold": -0.6861468410704624, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17610417306423187, "epoch": 3.57, "learning_rate": 7.06355793245134e-07, "loss": 0.164, "step": 9880, "task_loss": 0.4238791763782501 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014712469955178206, "compression/movement_sparsity/importance_threshold": -0.6849904790178946, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.191780686378479, "epoch": 3.57, "learning_rate": 7.045474447041106e-07, "loss": 0.1914, "step": 9890, "task_loss": 0.7447193264961243 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014797267076381265, "compression/movement_sparsity/importance_threshold": -0.6838354169056681, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18503516912460327, "epoch": 3.58, "learning_rate": 7.027401541165079e-07, "loss": 0.1899, "step": 9900, "task_loss": 0.3769063651561737 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014881968818120567, "compression/movement_sparsity/importance_threshold": -0.6826816540026991, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16122739017009735, "epoch": 3.58, "learning_rate": 7.009339279538536e-07, "loss": 0.1648, "step": 9910, "task_loss": 0.8050833344459534 }, { "compression/movement_sparsity/importance_regularization_factor": 0.014966575234067515, "compression/movement_sparsity/importance_threshold": -0.6815291895779036, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17400379478931427, "epoch": 3.59, "learning_rate": 6.991287726838628e-07, "loss": 0.192, "step": 9920, "task_loss": 0.508493959903717 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015051086377893522, "compression/movement_sparsity/importance_threshold": -0.6803780229001979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15540730953216553, "epoch": 3.59, "learning_rate": 6.973246947704171e-07, "loss": 0.1824, "step": 9930, "task_loss": 0.4139855206012726 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015135502303269998, "compression/movement_sparsity/importance_threshold": -0.6792281532384978, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15979820489883423, "epoch": 3.59, "learning_rate": 6.955217006735398e-07, "loss": 0.1839, "step": 9940, "task_loss": 0.5479603409767151 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015219823063868352, "compression/movement_sparsity/importance_threshold": -0.6780795798617196, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21421319246292114, "epoch": 3.6, "learning_rate": 6.937197968493731e-07, "loss": 0.1756, "step": 9950, "task_loss": 0.8114826083183289 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01530404871336, "compression/movement_sparsity/importance_threshold": -0.6769323020387791, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18751713633537292, "epoch": 3.6, "learning_rate": 6.919189897501558e-07, "loss": 0.182, "step": 9960, "task_loss": 0.3587406575679779 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015388179305416349, "compression/movement_sparsity/importance_threshold": -0.6757863190385925, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18434390425682068, "epoch": 3.6, "learning_rate": 6.901192858241987e-07, "loss": 0.1779, "step": 9970, "task_loss": 0.2552274465560913 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01547221489370881, "compression/movement_sparsity/importance_threshold": -0.6746416301300759, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14075002074241638, "epoch": 3.61, "learning_rate": 6.883206915158626e-07, "loss": 0.1716, "step": 9980, "task_loss": 0.3761569857597351 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015556155531908776, "compression/movement_sparsity/importance_threshold": -0.6734982345821456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19011226296424866, "epoch": 3.61, "learning_rate": 6.865232132655361e-07, "loss": 0.1753, "step": 9990, "task_loss": 0.5310980081558228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015640001273687695, "compression/movement_sparsity/importance_threshold": -0.6723561316637171, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1842782199382782, "epoch": 3.61, "learning_rate": 6.847268575096094e-07, "loss": 0.1791, "step": 10000, "task_loss": 0.4414299428462982 }, { "epoch": 3.61, "eval_exact_match": 83.61400189214758, "eval_f1": 90.00351389360986, "step": 10000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015723752172716956, "compression/movement_sparsity/importance_threshold": -0.6712153206437068, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17395688593387604, "epoch": 3.62, "learning_rate": 6.829316306804554e-07, "loss": 0.1791, "step": 10010, "task_loss": 0.4833320379257202 }, { "compression/movement_sparsity/importance_regularization_factor": 0.015807408282667967, "compression/movement_sparsity/importance_threshold": -0.6700758007910308, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13925310969352722, "epoch": 3.62, "learning_rate": 6.811375392064027e-07, "loss": 0.1643, "step": 10020, "task_loss": 0.34654271602630615 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01589096965721215, "compression/movement_sparsity/importance_threshold": -0.6689375713746049, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1650667041540146, "epoch": 3.62, "learning_rate": 6.793445895117156e-07, "loss": 0.1744, "step": 10030, "task_loss": 0.395840048789978 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0159744363500209, "compression/movement_sparsity/importance_threshold": -0.6678006316633456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16980409622192383, "epoch": 3.63, "learning_rate": 6.775527880165703e-07, "loss": 0.1817, "step": 10040, "task_loss": 0.44540518522262573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016057808414765642, "compression/movement_sparsity/importance_threshold": -0.6666649809261685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.181473508477211, "epoch": 3.63, "learning_rate": 6.7576214113703e-07, "loss": 0.1781, "step": 10050, "task_loss": 0.4364219307899475 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016141085905117782, "compression/movement_sparsity/importance_threshold": -0.66553061843199, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1557856649160385, "epoch": 3.64, "learning_rate": 6.739726552850247e-07, "loss": 0.1582, "step": 10060, "task_loss": 0.31843990087509155 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016224268874748718, "compression/movement_sparsity/importance_threshold": -0.6643975434497262, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1424463987350464, "epoch": 3.64, "learning_rate": 6.721843368683263e-07, "loss": 0.1705, "step": 10070, "task_loss": 0.5333583354949951 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016307357377329884, "compression/movement_sparsity/importance_threshold": -0.663265755248293, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17959821224212646, "epoch": 3.64, "learning_rate": 6.703971922905274e-07, "loss": 0.182, "step": 10080, "task_loss": 0.6014090776443481 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016390351466532678, "compression/movement_sparsity/importance_threshold": -0.6621352530966064, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1679791957139969, "epoch": 3.65, "learning_rate": 6.686112279510157e-07, "loss": 0.1826, "step": 10090, "task_loss": 0.3521926999092102 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016473251196028507, "compression/movement_sparsity/importance_threshold": -0.6610060362635826, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14580708742141724, "epoch": 3.65, "learning_rate": 6.668264502449541e-07, "loss": 0.1765, "step": 10100, "task_loss": 0.39751163125038147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016556056619488783, "compression/movement_sparsity/importance_threshold": -0.6598781040181376, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13637031614780426, "epoch": 3.65, "learning_rate": 6.650428655632563e-07, "loss": 0.1668, "step": 10110, "task_loss": 0.51078200340271 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01663876779058492, "compression/movement_sparsity/importance_threshold": -0.6587514556291877, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15260596573352814, "epoch": 3.66, "learning_rate": 6.63260480292563e-07, "loss": 0.1765, "step": 10120, "task_loss": 0.2957655191421509 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016721384762988314, "compression/movement_sparsity/importance_threshold": -0.6576260903656488, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1376221477985382, "epoch": 3.66, "learning_rate": 6.614793008152212e-07, "loss": 0.1603, "step": 10130, "task_loss": 0.33394917845726013 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016803907590370405, "compression/movement_sparsity/importance_threshold": -0.6565020074964367, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16349947452545166, "epoch": 3.66, "learning_rate": 6.596993335092593e-07, "loss": 0.1774, "step": 10140, "task_loss": 0.5207610726356506 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016886336326402594, "compression/movement_sparsity/importance_threshold": -0.6553792062904678, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15152761340141296, "epoch": 3.67, "learning_rate": 6.57920584748366e-07, "loss": 0.1672, "step": 10150, "task_loss": 0.5067110657691956 }, { "compression/movement_sparsity/importance_regularization_factor": 0.016968671024756275, "compression/movement_sparsity/importance_threshold": -0.6542576860166581, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15529385209083557, "epoch": 3.67, "learning_rate": 6.561430609018667e-07, "loss": 0.1685, "step": 10160, "task_loss": 0.34697654843330383 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017050911739102878, "compression/movement_sparsity/importance_threshold": -0.6531374459439235, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19189101457595825, "epoch": 3.68, "learning_rate": 6.543667683346991e-07, "loss": 0.1928, "step": 10170, "task_loss": 0.45649850368499756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0171330585231138, "compression/movement_sparsity/importance_threshold": -0.6520184853411803, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16808871924877167, "epoch": 3.68, "learning_rate": 6.52591713407394e-07, "loss": 0.1602, "step": 10180, "task_loss": 0.37760788202285767 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017215111430460457, "compression/movement_sparsity/importance_threshold": -0.6509008034773445, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1413610577583313, "epoch": 3.68, "learning_rate": 6.508179024760487e-07, "loss": 0.1723, "step": 10190, "task_loss": 0.2622392773628235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01729707051481426, "compression/movement_sparsity/importance_threshold": -0.649784399621332, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1771795153617859, "epoch": 3.69, "learning_rate": 6.49045341892308e-07, "loss": 0.1769, "step": 10200, "task_loss": 0.6021856069564819 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017378935829846617, "compression/movement_sparsity/importance_threshold": -0.6486692730420591, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1472265124320984, "epoch": 3.69, "learning_rate": 6.47274038003337e-07, "loss": 0.1614, "step": 10210, "task_loss": 0.36952024698257446 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017460707429228936, "compression/movement_sparsity/importance_threshold": -0.6475554230084418, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12765908241271973, "epoch": 3.69, "learning_rate": 6.455039971518029e-07, "loss": 0.1698, "step": 10220, "task_loss": 0.42384105920791626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017542385366632635, "compression/movement_sparsity/importance_threshold": -0.6464428487893962, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17819051444530487, "epoch": 3.7, "learning_rate": 6.437352256758495e-07, "loss": 0.1721, "step": 10230, "task_loss": 0.4150475859642029 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017623969695729123, "compression/movement_sparsity/importance_threshold": -0.6453315496538382, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1502448171377182, "epoch": 3.7, "learning_rate": 6.419677299090748e-07, "loss": 0.1764, "step": 10240, "task_loss": 0.268942266702652 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017705460470189813, "compression/movement_sparsity/importance_threshold": -0.644221524870684, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22100843489170074, "epoch": 3.7, "learning_rate": 6.402015161805097e-07, "loss": 0.1917, "step": 10250, "task_loss": 0.2643333077430725 }, { "epoch": 3.7, "eval_exact_match": 83.61400189214758, "eval_f1": 89.94770097613869, "step": 10250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01778685774368611, "compression/movement_sparsity/importance_threshold": -0.6431127737088496, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1667742282152176, "epoch": 3.71, "learning_rate": 6.384365908145933e-07, "loss": 0.1676, "step": 10260, "task_loss": 0.3106589913368225 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017868161569889417, "compression/movement_sparsity/importance_threshold": -0.6420052954372512, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15289315581321716, "epoch": 3.71, "learning_rate": 6.366729601311521e-07, "loss": 0.1596, "step": 10270, "task_loss": 0.24741002917289734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.017949372002471143, "compression/movement_sparsity/importance_threshold": -0.6408990893248049, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17078140377998352, "epoch": 3.72, "learning_rate": 6.349106304453769e-07, "loss": 0.1687, "step": 10280, "task_loss": 0.1850242167711258 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018030489095102725, "compression/movement_sparsity/importance_threshold": -0.6397941546404264, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15359310805797577, "epoch": 3.72, "learning_rate": 6.331496080677985e-07, "loss": 0.1632, "step": 10290, "task_loss": 0.2576896548271179 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01811151290145556, "compression/movement_sparsity/importance_threshold": -0.6386904906530321, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1645045280456543, "epoch": 3.72, "learning_rate": 6.313898993042681e-07, "loss": 0.177, "step": 10300, "task_loss": 0.43283289670944214 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018192443475201055, "compression/movement_sparsity/importance_threshold": -0.6375880966315379, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1887405514717102, "epoch": 3.73, "learning_rate": 6.29631510455932e-07, "loss": 0.1839, "step": 10310, "task_loss": 0.36919164657592773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018273280870010622, "compression/movement_sparsity/importance_threshold": -0.63648697184486, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17336630821228027, "epoch": 3.73, "learning_rate": 6.278744478192113e-07, "loss": 0.172, "step": 10320, "task_loss": 0.5593419075012207 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018354025139555672, "compression/movement_sparsity/importance_threshold": -0.6353871155619144, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18160130083560944, "epoch": 3.73, "learning_rate": 6.261187176857765e-07, "loss": 0.17, "step": 10330, "task_loss": 0.3647115230560303 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018434676337507616, "compression/movement_sparsity/importance_threshold": -0.6342885270516171, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1591804027557373, "epoch": 3.74, "learning_rate": 6.243643263425285e-07, "loss": 0.1672, "step": 10340, "task_loss": 0.6932095289230347 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018515234517537865, "compression/movement_sparsity/importance_threshold": -0.6331912055828843, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17311957478523254, "epoch": 3.74, "learning_rate": 6.226112800715733e-07, "loss": 0.1676, "step": 10350, "task_loss": 0.2462514042854309 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018595699733317827, "compression/movement_sparsity/importance_threshold": -0.632095150424632, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14102676510810852, "epoch": 3.74, "learning_rate": 6.208595851502003e-07, "loss": 0.1857, "step": 10360, "task_loss": 0.4470866024494171 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01867607203851891, "compression/movement_sparsity/importance_threshold": -0.6310003608457764, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15304112434387207, "epoch": 3.75, "learning_rate": 6.191092478508611e-07, "loss": 0.1751, "step": 10370, "task_loss": 0.49549001455307007 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018756351486812525, "compression/movement_sparsity/importance_threshold": -0.6299068361152333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17790237069129944, "epoch": 3.75, "learning_rate": 6.173602744411445e-07, "loss": 0.1745, "step": 10380, "task_loss": 0.38540562987327576 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018836538131870098, "compression/movement_sparsity/importance_threshold": -0.628814575501919, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18003492057323456, "epoch": 3.75, "learning_rate": 6.156126711837563e-07, "loss": 0.1702, "step": 10390, "task_loss": 0.5601201057434082 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018916632027363017, "compression/movement_sparsity/importance_threshold": -0.6277235782747494, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19177278876304626, "epoch": 3.76, "learning_rate": 6.138664443364964e-07, "loss": 0.1847, "step": 10400, "task_loss": 0.6107884049415588 }, { "compression/movement_sparsity/importance_regularization_factor": 0.018996633226962715, "compression/movement_sparsity/importance_threshold": -0.6266338437026406, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15945954620838165, "epoch": 3.76, "learning_rate": 6.121216001522353e-07, "loss": 0.1776, "step": 10410, "task_loss": 0.47822341322898865 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019076541784340567, "compression/movement_sparsity/importance_threshold": -0.625545371054509, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17401601374149323, "epoch": 3.77, "learning_rate": 6.103781448788929e-07, "loss": 0.1751, "step": 10420, "task_loss": 0.3851965665817261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019156357753168023, "compression/movement_sparsity/importance_threshold": -0.6244581595992702, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1592417061328888, "epoch": 3.77, "learning_rate": 6.086360847594153e-07, "loss": 0.1729, "step": 10430, "task_loss": 0.5951048135757446 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01923608118711648, "compression/movement_sparsity/importance_threshold": -0.6233722086058404, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17043137550354004, "epoch": 3.77, "learning_rate": 6.068954260317535e-07, "loss": 0.1583, "step": 10440, "task_loss": 0.355490505695343 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01931571213985735, "compression/movement_sparsity/importance_threshold": -0.6222875173431357, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1629900336265564, "epoch": 3.78, "learning_rate": 6.051561749288404e-07, "loss": 0.1891, "step": 10450, "task_loss": 0.36418741941452026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019395250665062033, "compression/movement_sparsity/importance_threshold": -0.6212040850800722, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1411941647529602, "epoch": 3.78, "learning_rate": 6.034183376785675e-07, "loss": 0.1646, "step": 10460, "task_loss": 0.45510485768318176 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019474696816401953, "compression/movement_sparsity/importance_threshold": -0.620121911085566, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15264548361301422, "epoch": 3.78, "learning_rate": 6.016819205037645e-07, "loss": 0.1679, "step": 10470, "task_loss": 0.4522840976715088 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019554050647548515, "compression/movement_sparsity/importance_threshold": -0.6190409946285329, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16963550448417664, "epoch": 3.79, "learning_rate": 5.999469296221759e-07, "loss": 0.1836, "step": 10480, "task_loss": 0.4056473970413208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.019633312212173126, "compression/movement_sparsity/importance_threshold": -0.6179613349778894, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15899108350276947, "epoch": 3.79, "learning_rate": 5.982133712464392e-07, "loss": 0.1595, "step": 10490, "task_loss": 0.291488379240036 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0197124815639472, "compression/movement_sparsity/importance_threshold": -0.6168829314025512, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15133503079414368, "epoch": 3.79, "learning_rate": 5.964812515840616e-07, "loss": 0.1754, "step": 10500, "task_loss": 0.2718963325023651 }, { "epoch": 3.79, "eval_exact_match": 83.65184484389782, "eval_f1": 90.02195293125705, "step": 10500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01979155875654215, "compression/movement_sparsity/importance_threshold": -0.6158057831714346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16352495551109314, "epoch": 3.8, "learning_rate": 5.947505768373991e-07, "loss": 0.1724, "step": 10510, "task_loss": 0.27979862689971924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.01987054384362938, "compression/movement_sparsity/importance_threshold": -0.6147298895534556, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15177211165428162, "epoch": 3.8, "learning_rate": 5.930213532036344e-07, "loss": 0.1713, "step": 10520, "task_loss": 0.4249712824821472 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0199494368788803, "compression/movement_sparsity/importance_threshold": -0.6136552498175303, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1794542670249939, "epoch": 3.81, "learning_rate": 5.912935868747525e-07, "loss": 0.1846, "step": 10530, "task_loss": 0.4833950400352478 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02002823791596634, "compression/movement_sparsity/importance_threshold": -0.6125818632325746, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15238790214061737, "epoch": 3.81, "learning_rate": 5.895672840375216e-07, "loss": 0.1683, "step": 10540, "task_loss": 0.34725263714790344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02010694700855888, "compression/movement_sparsity/importance_threshold": -0.6115097290675047, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19352784752845764, "epoch": 3.81, "learning_rate": 5.878424508734687e-07, "loss": 0.1865, "step": 10550, "task_loss": 0.4109495282173157 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02018556421032935, "compression/movement_sparsity/importance_threshold": -0.6104388465912367, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1467682123184204, "epoch": 3.82, "learning_rate": 5.861190935588583e-07, "loss": 0.1751, "step": 10560, "task_loss": 0.39710533618927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020264089574949143, "compression/movement_sparsity/importance_threshold": -0.6093692150726868, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17885571718215942, "epoch": 3.82, "learning_rate": 5.843972182646706e-07, "loss": 0.1743, "step": 10570, "task_loss": 0.4837023913860321 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0203425231560897, "compression/movement_sparsity/importance_threshold": -0.6083008337807706, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1784561276435852, "epoch": 3.82, "learning_rate": 5.826768311565777e-07, "loss": 0.1766, "step": 10580, "task_loss": 0.4210522174835205 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02042086500742242, "compression/movement_sparsity/importance_threshold": -0.6072337019844045, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14249750971794128, "epoch": 3.83, "learning_rate": 5.809579383949251e-07, "loss": 0.1779, "step": 10590, "task_loss": 0.330593079328537 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0204991151826187, "compression/movement_sparsity/importance_threshold": -0.6061678189525045, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17461369931697845, "epoch": 3.83, "learning_rate": 5.792405461347049e-07, "loss": 0.1777, "step": 10600, "task_loss": 0.40263602137565613 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02057727373534996, "compression/movement_sparsity/importance_threshold": -0.6051031839539867, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18333062529563904, "epoch": 3.83, "learning_rate": 5.775246605255384e-07, "loss": 0.181, "step": 10610, "task_loss": 0.541496992111206 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020655340719287613, "compression/movement_sparsity/importance_threshold": -0.6040397962577672, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18711034953594208, "epoch": 3.84, "learning_rate": 5.758102877116498e-07, "loss": 0.1763, "step": 10620, "task_loss": 0.3023425340652466 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020733316188103066, "compression/movement_sparsity/importance_threshold": -0.6029776551327619, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16456711292266846, "epoch": 3.84, "learning_rate": 5.740974338318476e-07, "loss": 0.1626, "step": 10630, "task_loss": 0.5074876546859741 }, { "compression/movement_sparsity/importance_regularization_factor": 0.020811200195467734, "compression/movement_sparsity/importance_threshold": -0.601916759847887, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14603650569915771, "epoch": 3.85, "learning_rate": 5.723861050195018e-07, "loss": 0.1859, "step": 10640, "task_loss": 0.5426656007766724 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02088899279505301, "compression/movement_sparsity/importance_threshold": -0.6008571096720587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19825714826583862, "epoch": 3.85, "learning_rate": 5.706763074025198e-07, "loss": 0.1859, "step": 10650, "task_loss": 0.45735934376716614 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02096669404053033, "compression/movement_sparsity/importance_threshold": -0.5997987038741928, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12421312928199768, "epoch": 3.85, "learning_rate": 5.689680471033278e-07, "loss": 0.1666, "step": 10660, "task_loss": 0.292208194732666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021044303985571094, "compression/movement_sparsity/importance_threshold": -0.5987415417232054, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1720031499862671, "epoch": 3.86, "learning_rate": 5.672613302388456e-07, "loss": 0.1797, "step": 10670, "task_loss": 0.4465997815132141 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021121822683846707, "compression/movement_sparsity/importance_threshold": -0.5976856224880127, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13473817706108093, "epoch": 3.86, "learning_rate": 5.655561629204675e-07, "loss": 0.1779, "step": 10680, "task_loss": 0.5434313416481018 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021199250189028584, "compression/movement_sparsity/importance_threshold": -0.5966309454375308, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1842292696237564, "epoch": 3.86, "learning_rate": 5.638525512540389e-07, "loss": 0.1743, "step": 10690, "task_loss": 0.846764326095581 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021276586554788132, "compression/movement_sparsity/importance_threshold": -0.5955775098406757, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12809939682483673, "epoch": 3.87, "learning_rate": 5.621505013398344e-07, "loss": 0.1692, "step": 10700, "task_loss": 0.359587699174881 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021353831834796756, "compression/movement_sparsity/importance_threshold": -0.5945253149663636, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17134542763233185, "epoch": 3.87, "learning_rate": 5.604500192725374e-07, "loss": 0.184, "step": 10710, "task_loss": 0.6001778841018677 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021430986082725884, "compression/movement_sparsity/importance_threshold": -0.5934743600835102, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1271722912788391, "epoch": 3.87, "learning_rate": 5.587511111412151e-07, "loss": 0.1631, "step": 10720, "task_loss": 0.2941231429576874 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021508049352246924, "compression/movement_sparsity/importance_threshold": -0.5924246444610318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14123263955116272, "epoch": 3.88, "learning_rate": 5.570537830293006e-07, "loss": 0.1617, "step": 10730, "task_loss": 0.2916907072067261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02158502169703128, "compression/movement_sparsity/importance_threshold": -0.5913761673678445, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13426288962364197, "epoch": 3.88, "learning_rate": 5.553580410145688e-07, "loss": 0.1659, "step": 10740, "task_loss": 0.5448470115661621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02166190317075037, "compression/movement_sparsity/importance_threshold": -0.5903289280728642, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13951101899147034, "epoch": 3.89, "learning_rate": 5.536638911691143e-07, "loss": 0.1793, "step": 10750, "task_loss": 0.40928834676742554 }, { "epoch": 3.89, "eval_exact_match": 83.57615894039735, "eval_f1": 89.997904877457, "step": 10750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02173869382707558, "compression/movement_sparsity/importance_threshold": -0.5892829258450072, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1252758651971817, "epoch": 3.89, "learning_rate": 5.519713395593321e-07, "loss": 0.172, "step": 10760, "task_loss": 0.2951388359069824 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021815393719678354, "compression/movement_sparsity/importance_threshold": -0.5882381599531894, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2374802976846695, "epoch": 3.89, "learning_rate": 5.502803922458924e-07, "loss": 0.1794, "step": 10770, "task_loss": 0.4524235725402832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021892002902230083, "compression/movement_sparsity/importance_threshold": -0.5871946296663269, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19318094849586487, "epoch": 3.9, "learning_rate": 5.485910552837225e-07, "loss": 0.176, "step": 10780, "task_loss": 0.4039768576622009 }, { "compression/movement_sparsity/importance_regularization_factor": 0.021968521428402183, "compression/movement_sparsity/importance_threshold": -0.5861523342533359, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13738399744033813, "epoch": 3.9, "learning_rate": 5.469033347219816e-07, "loss": 0.1636, "step": 10790, "task_loss": 0.5032458901405334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02204494935186606, "compression/movement_sparsity/importance_threshold": -0.5851112729831323, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17375808954238892, "epoch": 3.9, "learning_rate": 5.452172366040423e-07, "loss": 0.1675, "step": 10800, "task_loss": 0.6967633962631226 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022121286726293127, "compression/movement_sparsity/importance_threshold": -0.5840714451246322, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17048680782318115, "epoch": 3.91, "learning_rate": 5.435327669674672e-07, "loss": 0.1687, "step": 10810, "task_loss": 0.30930009484291077 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022197533605354803, "compression/movement_sparsity/importance_threshold": -0.5830328499467516, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1512829065322876, "epoch": 3.91, "learning_rate": 5.418499318439875e-07, "loss": 0.1747, "step": 10820, "task_loss": 0.35338839888572693 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022273690042722484, "compression/movement_sparsity/importance_threshold": -0.5819954867184067, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.186394602060318, "epoch": 3.91, "learning_rate": 5.401687372594819e-07, "loss": 0.1737, "step": 10830, "task_loss": 0.25927868485450745 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022349756092067595, "compression/movement_sparsity/importance_threshold": -0.5809593547085136, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20156210660934448, "epoch": 3.92, "learning_rate": 5.384891892339539e-07, "loss": 0.1807, "step": 10840, "task_loss": 0.5239515900611877 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022425731807061536, "compression/movement_sparsity/importance_threshold": -0.5799244531859882, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15591740608215332, "epoch": 3.92, "learning_rate": 5.368112937815114e-07, "loss": 0.1818, "step": 10850, "task_loss": 0.2408059537410736 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022501617241375716, "compression/movement_sparsity/importance_threshold": -0.5788907814197467, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.147825688123703, "epoch": 3.92, "learning_rate": 5.351350569103459e-07, "loss": 0.1575, "step": 10860, "task_loss": 0.3448067009449005 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022577412448681538, "compression/movement_sparsity/importance_threshold": -0.5778583386787053, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21096472442150116, "epoch": 3.93, "learning_rate": 5.334604846227077e-07, "loss": 0.1845, "step": 10870, "task_loss": 0.7965984344482422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022653117482650437, "compression/movement_sparsity/importance_threshold": -0.5768271242317797, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18514427542686462, "epoch": 3.93, "learning_rate": 5.317875829148885e-07, "loss": 0.1687, "step": 10880, "task_loss": 0.43882808089256287 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02272873239695382, "compression/movement_sparsity/importance_threshold": -0.5757971373478863, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1756231188774109, "epoch": 3.94, "learning_rate": 5.301163577771966e-07, "loss": 0.1893, "step": 10890, "task_loss": 0.36089468002319336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02280425724526309, "compression/movement_sparsity/importance_threshold": -0.5747683772959408, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13075178861618042, "epoch": 3.94, "learning_rate": 5.284468151939383e-07, "loss": 0.178, "step": 10900, "task_loss": 0.5520082712173462 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022879692081249646, "compression/movement_sparsity/importance_threshold": -0.5737408433448596, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15874437987804413, "epoch": 3.94, "learning_rate": 5.267789611433934e-07, "loss": 0.1804, "step": 10910, "task_loss": 0.3404083847999573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.022955036958584914, "compression/movement_sparsity/importance_threshold": -0.5727145347635587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12918061017990112, "epoch": 3.95, "learning_rate": 5.251128015977966e-07, "loss": 0.1761, "step": 10920, "task_loss": 0.1778661012649536 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02303029193094031, "compression/movement_sparsity/importance_threshold": -0.5716894508209539, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17667776346206665, "epoch": 3.95, "learning_rate": 5.234483425233145e-07, "loss": 0.1753, "step": 10930, "task_loss": 0.5118983387947083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023105457051987223, "compression/movement_sparsity/importance_threshold": -0.5706655907859617, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18898850679397583, "epoch": 3.95, "learning_rate": 5.217855898800249e-07, "loss": 0.1735, "step": 10940, "task_loss": 0.4475729465484619 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023180532375397078, "compression/movement_sparsity/importance_threshold": -0.5696429539274979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1563543677330017, "epoch": 3.96, "learning_rate": 5.201245496218955e-07, "loss": 0.1829, "step": 10950, "task_loss": 0.5748417377471924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023255517954841286, "compression/movement_sparsity/importance_threshold": -0.5686215395144786, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1615605354309082, "epoch": 3.96, "learning_rate": 5.18465227696761e-07, "loss": 0.18, "step": 10960, "task_loss": 0.3879707455635071 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02333041384399125, "compression/movement_sparsity/importance_threshold": -0.56760134681582, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16784200072288513, "epoch": 3.96, "learning_rate": 5.168076300463044e-07, "loss": 0.1823, "step": 10970, "task_loss": 0.72287917137146 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02340522009651837, "compression/movement_sparsity/importance_threshold": -0.5665823751004382, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17279008030891418, "epoch": 3.97, "learning_rate": 5.151517626060346e-07, "loss": 0.1768, "step": 10980, "task_loss": 0.4392220973968506 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023479936766094087, "compression/movement_sparsity/importance_threshold": -0.5655646236372489, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16514915227890015, "epoch": 3.97, "learning_rate": 5.134976313052633e-07, "loss": 0.1753, "step": 10990, "task_loss": 0.7647157907485962 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023554563906389804, "compression/movement_sparsity/importance_threshold": -0.5645480916951683, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15675148367881775, "epoch": 3.98, "learning_rate": 5.118452420670876e-07, "loss": 0.1832, "step": 11000, "task_loss": 0.43583187460899353 }, { "epoch": 3.98, "eval_exact_match": 83.6329233680227, "eval_f1": 90.05063824812734, "step": 11000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023629101571076915, "compression/movement_sparsity/importance_threshold": -0.5635327785431127, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14564642310142517, "epoch": 3.98, "learning_rate": 5.101946008083647e-07, "loss": 0.1726, "step": 11010, "task_loss": 0.4629361033439636 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023703549813826847, "compression/movement_sparsity/importance_threshold": -0.5625186834499979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13881969451904297, "epoch": 3.98, "learning_rate": 5.085457134396945e-07, "loss": 0.1657, "step": 11020, "task_loss": 0.4461815357208252 }, { "compression/movement_sparsity/importance_regularization_factor": 0.023777908688310995, "compression/movement_sparsity/importance_threshold": -0.5615058056847402, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16499027609825134, "epoch": 3.99, "learning_rate": 5.068985858653947e-07, "loss": 0.1757, "step": 11030, "task_loss": 0.237386554479599 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02385217824820078, "compression/movement_sparsity/importance_threshold": -0.5604941445162555, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14505073428153992, "epoch": 3.99, "learning_rate": 5.052532239834831e-07, "loss": 0.1756, "step": 11040, "task_loss": 0.3913651406764984 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02392635854716761, "compression/movement_sparsity/importance_threshold": -0.55948369921346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16279101371765137, "epoch": 3.99, "learning_rate": 5.036096336856539e-07, "loss": 0.1736, "step": 11050, "task_loss": 0.2173888087272644 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0240004496388829, "compression/movement_sparsity/importance_threshold": -0.5584744690452697, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19913247227668762, "epoch": 4.0, "learning_rate": 5.019678208572585e-07, "loss": 0.1781, "step": 11060, "task_loss": 0.4576454758644104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024074451577018048, "compression/movement_sparsity/importance_threshold": -0.5574664532806006, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1983916163444519, "epoch": 4.0, "learning_rate": 5.003277913772834e-07, "loss": 0.1729, "step": 11070, "task_loss": 0.3344120383262634 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024148364415244483, "compression/movement_sparsity/importance_threshold": -0.5564596511883687, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13938963413238525, "epoch": 4.0, "learning_rate": 4.986895511183282e-07, "loss": 0.1749, "step": 11080, "task_loss": 0.34267091751098633 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024222188207233597, "compression/movement_sparsity/importance_threshold": -0.5554540620374904, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15878230333328247, "epoch": 4.01, "learning_rate": 4.970531059465865e-07, "loss": 0.1706, "step": 11090, "task_loss": 0.41367635130882263 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024295923006656818, "compression/movement_sparsity/importance_threshold": -0.5544496850968814, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18763568997383118, "epoch": 4.01, "learning_rate": 4.954184617218251e-07, "loss": 0.1847, "step": 11100, "task_loss": 0.38458868861198425 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024369568867185543, "compression/movement_sparsity/importance_threshold": -0.5534465196354581, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1669691503047943, "epoch": 4.02, "learning_rate": 4.937856242973598e-07, "loss": 0.1704, "step": 11110, "task_loss": 0.40393808484077454 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024443125842491187, "compression/movement_sparsity/importance_threshold": -0.5524445649221362, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1612987220287323, "epoch": 4.02, "learning_rate": 4.921545995200387e-07, "loss": 0.1749, "step": 11120, "task_loss": 0.3647257089614868 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024516593986245158, "compression/movement_sparsity/importance_threshold": -0.5514438202258322, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1850115805864334, "epoch": 4.02, "learning_rate": 4.905253932302173e-07, "loss": 0.1773, "step": 11130, "task_loss": 0.6843971014022827 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02458997335211887, "compression/movement_sparsity/importance_threshold": -0.5504442848154618, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19363956153392792, "epoch": 4.03, "learning_rate": 4.88898011261741e-07, "loss": 0.1652, "step": 11140, "task_loss": 0.40217435359954834 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024663263993783737, "compression/movement_sparsity/importance_threshold": -0.5494459579599411, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1595451831817627, "epoch": 4.03, "learning_rate": 4.872724594419225e-07, "loss": 0.1616, "step": 11150, "task_loss": 0.29367250204086304 }, { "compression/movement_sparsity/importance_regularization_factor": 0.024736465964911164, "compression/movement_sparsity/importance_threshold": -0.5484488389281864, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16744457185268402, "epoch": 4.03, "learning_rate": 4.8564874359152e-07, "loss": 0.1756, "step": 11160, "task_loss": 0.3866707682609558 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02480957931917255, "compression/movement_sparsity/importance_threshold": -0.5474529269891137, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1353551745414734, "epoch": 4.04, "learning_rate": 4.840268695247185e-07, "loss": 0.1662, "step": 11170, "task_loss": 0.4497717022895813 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02488260411023933, "compression/movement_sparsity/importance_threshold": -0.5464582214116389, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16634535789489746, "epoch": 4.04, "learning_rate": 4.824068430491079e-07, "loss": 0.1605, "step": 11180, "task_loss": 0.4282223582267761 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02495554039178291, "compression/movement_sparsity/importance_threshold": -0.5454647214646782, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16130493581295013, "epoch": 4.04, "learning_rate": 4.807886699656621e-07, "loss": 0.1793, "step": 11190, "task_loss": 0.6635265350341797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025028388217474686, "compression/movement_sparsity/importance_threshold": -0.5444724264171475, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16918551921844482, "epoch": 4.05, "learning_rate": 4.791723560687181e-07, "loss": 0.1759, "step": 11200, "task_loss": 0.30166515707969666 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02510114764098608, "compression/movement_sparsity/importance_threshold": -0.5434813355379631, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14573919773101807, "epoch": 4.05, "learning_rate": 4.775579071459558e-07, "loss": 0.1725, "step": 11210, "task_loss": 0.3856244385242462 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0251738187159885, "compression/movement_sparsity/importance_threshold": -0.542491448096041, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14356495440006256, "epoch": 4.05, "learning_rate": 4.759453289783776e-07, "loss": 0.1684, "step": 11220, "task_loss": 0.2242923080921173 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025246401496153356, "compression/movement_sparsity/importance_threshold": -0.5415027633602971, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18279266357421875, "epoch": 4.06, "learning_rate": 4.7433462734028563e-07, "loss": 0.1697, "step": 11230, "task_loss": 0.33016282320022583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025318896035152062, "compression/movement_sparsity/importance_threshold": -0.5405152805996476, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13953736424446106, "epoch": 4.06, "learning_rate": 4.727258079992643e-07, "loss": 0.1646, "step": 11240, "task_loss": 0.4277157187461853 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02539130238665602, "compression/movement_sparsity/importance_threshold": -0.5395289990830087, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1368265450000763, "epoch": 4.07, "learning_rate": 4.7111887671615635e-07, "loss": 0.1765, "step": 11250, "task_loss": 0.3417168855667114 }, { "epoch": 4.07, "eval_exact_match": 83.65184484389782, "eval_f1": 90.03112152658635, "step": 11250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025463620604336632, "compression/movement_sparsity/importance_threshold": -0.5385439180792964, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13024994730949402, "epoch": 4.07, "learning_rate": 4.6951383924504486e-07, "loss": 0.1765, "step": 11260, "task_loss": 0.21580921113491058 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02553585074186534, "compression/movement_sparsity/importance_threshold": -0.5375600368574266, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15584993362426758, "epoch": 4.07, "learning_rate": 4.679107013332316e-07, "loss": 0.1693, "step": 11270, "task_loss": 0.18834367394447327 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02560799285291353, "compression/movement_sparsity/importance_threshold": -0.5365773546863154, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14209944009780884, "epoch": 4.08, "learning_rate": 4.6630946872121534e-07, "loss": 0.1894, "step": 11280, "task_loss": 0.28304344415664673 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025680046991152623, "compression/movement_sparsity/importance_threshold": -0.5355958708348789, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1544867753982544, "epoch": 4.08, "learning_rate": 4.6471014714267353e-07, "loss": 0.1669, "step": 11290, "task_loss": 0.34882280230522156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.025752013210254023, "compression/movement_sparsity/importance_threshold": -0.5346155845720333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15384814143180847, "epoch": 4.08, "learning_rate": 4.6311274232443984e-07, "loss": 0.1769, "step": 11300, "task_loss": 0.6888493299484253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02582389156388915, "compression/movement_sparsity/importance_threshold": -0.5336364951666945, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14584749937057495, "epoch": 4.09, "learning_rate": 4.615172599864855e-07, "loss": 0.1893, "step": 11310, "task_loss": 0.42334309220314026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0258956821057294, "compression/movement_sparsity/importance_threshold": -0.5326586018877787, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17131412029266357, "epoch": 4.09, "learning_rate": 4.59923705841896e-07, "loss": 0.1795, "step": 11320, "task_loss": 0.6292405128479004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0259673848894462, "compression/movement_sparsity/importance_threshold": -0.5316819040042018, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1550251841545105, "epoch": 4.09, "learning_rate": 4.5833208559685377e-07, "loss": 0.179, "step": 11330, "task_loss": 0.7897596955299377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026038999968710952, "compression/movement_sparsity/importance_threshold": -0.5307064007848801, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1730719804763794, "epoch": 4.1, "learning_rate": 4.5674240495061643e-07, "loss": 0.1749, "step": 11340, "task_loss": 0.46178561449050903 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02611052739719506, "compression/movement_sparsity/importance_threshold": -0.5297320914987294, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19655971229076385, "epoch": 4.1, "learning_rate": 4.5515466959549486e-07, "loss": 0.1767, "step": 11350, "task_loss": 0.7543710470199585 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026181967228569945, "compression/movement_sparsity/importance_threshold": -0.528758975414666, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12220363318920135, "epoch": 4.11, "learning_rate": 4.5356888521683613e-07, "loss": 0.1517, "step": 11360, "task_loss": 0.3255242705345154 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02625331951650702, "compression/movement_sparsity/importance_threshold": -0.5277870518016057, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15358072519302368, "epoch": 4.11, "learning_rate": 4.519850574929996e-07, "loss": 0.1836, "step": 11370, "task_loss": 0.28549590706825256 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02632458431467768, "compression/movement_sparsity/importance_threshold": -0.5268163199284649, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16602346301078796, "epoch": 4.11, "learning_rate": 4.504031920953394e-07, "loss": 0.1814, "step": 11380, "task_loss": 0.48657315969467163 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02639576167675335, "compression/movement_sparsity/importance_threshold": -0.5258467790641594, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12628331780433655, "epoch": 4.12, "learning_rate": 4.4882329468818246e-07, "loss": 0.1836, "step": 11390, "task_loss": 0.2578519582748413 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026466851656405437, "compression/movement_sparsity/importance_threshold": -0.5248784284776054, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15595079958438873, "epoch": 4.12, "learning_rate": 4.472453709288091e-07, "loss": 0.1813, "step": 11400, "task_loss": 0.5081670880317688 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026537854307305344, "compression/movement_sparsity/importance_threshold": -0.523911267437719, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16610749065876007, "epoch": 4.12, "learning_rate": 4.4566942646743246e-07, "loss": 0.1716, "step": 11410, "task_loss": 0.43464046716690063 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026608769683124495, "compression/movement_sparsity/importance_threshold": -0.5229452952134162, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17217926681041718, "epoch": 4.13, "learning_rate": 4.4409546694717736e-07, "loss": 0.1618, "step": 11420, "task_loss": 0.38427919149398804 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026679597837534295, "compression/movement_sparsity/importance_threshold": -0.5219805110736129, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14132100343704224, "epoch": 4.13, "learning_rate": 4.425234980040622e-07, "loss": 0.1756, "step": 11430, "task_loss": 0.49500393867492676 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026750338824206146, "compression/movement_sparsity/importance_threshold": -0.5210169142872255, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.180766299366951, "epoch": 4.13, "learning_rate": 4.409535252669763e-07, "loss": 0.1809, "step": 11440, "task_loss": 0.47744685411453247 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026820992696811475, "compression/movement_sparsity/importance_threshold": -0.5200545041231699, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16767911612987518, "epoch": 4.14, "learning_rate": 4.3938555435766187e-07, "loss": 0.172, "step": 11450, "task_loss": 0.4917362332344055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026891559509021664, "compression/movement_sparsity/importance_threshold": -0.5190932798503622, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16706699132919312, "epoch": 4.14, "learning_rate": 4.378195908906932e-07, "loss": 0.1763, "step": 11460, "task_loss": 0.25731348991394043 }, { "compression/movement_sparsity/importance_regularization_factor": 0.026962039314508152, "compression/movement_sparsity/importance_threshold": -0.5181332407377185, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1750440001487732, "epoch": 4.15, "learning_rate": 4.362556404734552e-07, "loss": 0.1642, "step": 11470, "task_loss": 0.6891375184059143 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02703243216694235, "compression/movement_sparsity/importance_threshold": -0.5171743860541547, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1234075054526329, "epoch": 4.15, "learning_rate": 4.346937087061259e-07, "loss": 0.1645, "step": 11480, "task_loss": 0.4213874936103821 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027102738119995654, "compression/movement_sparsity/importance_threshold": -0.5162167150685868, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1771700531244278, "epoch": 4.15, "learning_rate": 4.3313380118165345e-07, "loss": 0.1784, "step": 11490, "task_loss": 0.4768829345703125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02717295722733948, "compression/movement_sparsity/importance_threshold": -0.5152602270499311, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17060580849647522, "epoch": 4.16, "learning_rate": 4.315759234857388e-07, "loss": 0.1732, "step": 11500, "task_loss": 0.7726600170135498 }, { "epoch": 4.16, "eval_exact_match": 83.6802270577105, "eval_f1": 90.00678546584051, "step": 11500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02724308954264524, "compression/movement_sparsity/importance_threshold": -0.5143049212671038, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19148531556129456, "epoch": 4.16, "learning_rate": 4.300200811968141e-07, "loss": 0.1736, "step": 11510, "task_loss": 0.3507334887981415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02731313511958434, "compression/movement_sparsity/importance_threshold": -0.5133507969890206, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1210765391588211, "epoch": 4.16, "learning_rate": 4.284662798860232e-07, "loss": 0.1682, "step": 11520, "task_loss": 0.20562471449375153 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027383094011828195, "compression/movement_sparsity/importance_threshold": -0.5123978534845979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14142274856567383, "epoch": 4.17, "learning_rate": 4.2691452511720194e-07, "loss": 0.1691, "step": 11530, "task_loss": 0.3383503556251526 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027452966273048215, "compression/movement_sparsity/importance_threshold": -0.5114460900227514, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1510867178440094, "epoch": 4.17, "learning_rate": 4.253648224468567e-07, "loss": 0.1818, "step": 11540, "task_loss": 0.7619709968566895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027522751956915795, "compression/movement_sparsity/importance_threshold": -0.5104955058723977, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17000426352024078, "epoch": 4.17, "learning_rate": 4.238171774241471e-07, "loss": 0.1604, "step": 11550, "task_loss": 0.3821756839752197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027592451117102377, "compression/movement_sparsity/importance_threshold": -0.5095461003024523, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12718622386455536, "epoch": 4.18, "learning_rate": 4.2227159559086466e-07, "loss": 0.1767, "step": 11560, "task_loss": 0.3057350814342499 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02766206380727935, "compression/movement_sparsity/importance_threshold": -0.5085978725818315, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13876132667064667, "epoch": 4.18, "learning_rate": 4.207280824814119e-07, "loss": 0.1755, "step": 11570, "task_loss": 0.48780569434165955 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027731590081118137, "compression/movement_sparsity/importance_threshold": -0.5076508219794513, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15152356028556824, "epoch": 4.19, "learning_rate": 4.191866436227851e-07, "loss": 0.1602, "step": 11580, "task_loss": 0.29727867245674133 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027801029992290137, "compression/movement_sparsity/importance_threshold": -0.506704947764228, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17890770733356476, "epoch": 4.19, "learning_rate": 4.1764728453455167e-07, "loss": 0.1692, "step": 11590, "task_loss": 0.46190762519836426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.027870383594466763, "compression/movement_sparsity/importance_threshold": -0.5057602492050775, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16518770158290863, "epoch": 4.19, "learning_rate": 4.1611001072883323e-07, "loss": 0.1793, "step": 11600, "task_loss": 0.6467550992965698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02793965094131943, "compression/movement_sparsity/importance_threshold": -0.5048167255709157, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12185207009315491, "epoch": 4.2, "learning_rate": 4.1457482771028305e-07, "loss": 0.1653, "step": 11610, "task_loss": 0.3818345069885254 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02800883208651955, "compression/movement_sparsity/importance_threshold": -0.5038743761306589, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1789764165878296, "epoch": 4.2, "learning_rate": 4.130417409760689e-07, "loss": 0.1788, "step": 11620, "task_loss": 0.6418240666389465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02807792708373852, "compression/movement_sparsity/importance_threshold": -0.5029332001532232, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1454584151506424, "epoch": 4.2, "learning_rate": 4.1151075601585174e-07, "loss": 0.1828, "step": 11630, "task_loss": 0.43138134479522705 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02814693598664776, "compression/movement_sparsity/importance_threshold": -0.5019931969075245, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16067031025886536, "epoch": 4.21, "learning_rate": 4.0998187831176636e-07, "loss": 0.1775, "step": 11640, "task_loss": 0.32413339614868164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02821585884891869, "compression/movement_sparsity/importance_threshold": -0.5010543656624789, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17555850744247437, "epoch": 4.21, "learning_rate": 4.084551133384024e-07, "loss": 0.1691, "step": 11650, "task_loss": 0.5207797884941101 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028284695724222705, "compression/movement_sparsity/importance_threshold": -0.5001167056870026, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12386974692344666, "epoch": 4.21, "learning_rate": 4.069304665627834e-07, "loss": 0.1682, "step": 11660, "task_loss": 0.26369139552116394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028353446666231224, "compression/movement_sparsity/importance_threshold": -0.4991802162500115, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15394540131092072, "epoch": 4.22, "learning_rate": 4.054079434443487e-07, "loss": 0.1543, "step": 11670, "task_loss": 0.4621257781982422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028422111728615654, "compression/movement_sparsity/importance_threshold": -0.4982448966204218, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15610939264297485, "epoch": 4.22, "learning_rate": 4.0388754943493374e-07, "loss": 0.174, "step": 11680, "task_loss": 0.7012438774108887 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02849069096504741, "compression/movement_sparsity/importance_threshold": -0.4973107460671495, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1879960000514984, "epoch": 4.22, "learning_rate": 4.023692899787486e-07, "loss": 0.181, "step": 11690, "task_loss": 0.5456829071044922 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028559184429197897, "compression/movement_sparsity/importance_threshold": -0.49637776385911064, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16950491070747375, "epoch": 4.23, "learning_rate": 4.0085317051236176e-07, "loss": 0.1782, "step": 11700, "task_loss": 0.5510709881782532 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028627592174738523, "compression/movement_sparsity/importance_threshold": -0.4954459492652214, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14448924362659454, "epoch": 4.23, "learning_rate": 3.9933919646467716e-07, "loss": 0.1733, "step": 11710, "task_loss": 0.4822184443473816 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028695914255340713, "compression/movement_sparsity/importance_threshold": -0.49451530155439766, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1638861894607544, "epoch": 4.24, "learning_rate": 3.9782737325691786e-07, "loss": 0.1785, "step": 11720, "task_loss": 0.3740285038948059 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02876415072467586, "compression/movement_sparsity/importance_threshold": -0.49358581999555573, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12357451766729355, "epoch": 4.24, "learning_rate": 3.96317706302604e-07, "loss": 0.1757, "step": 11730, "task_loss": 0.7518565654754639 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028832301636415394, "compression/movement_sparsity/importance_threshold": -0.49265750385761137, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11722180247306824, "epoch": 4.24, "learning_rate": 3.948102010075356e-07, "loss": 0.1569, "step": 11740, "task_loss": 0.4631866216659546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02890036704423069, "compression/movement_sparsity/importance_threshold": -0.49173035240948115, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.23655325174331665, "epoch": 4.25, "learning_rate": 3.933048627697717e-07, "loss": 0.1875, "step": 11750, "task_loss": 0.260358989238739 }, { "epoch": 4.25, "eval_exact_match": 83.80321665089878, "eval_f1": 90.13903938703879, "step": 11750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.028968347001793203, "compression/movement_sparsity/importance_threshold": -0.49080436492008056, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21262499690055847, "epoch": 4.25, "learning_rate": 3.9180169697961183e-07, "loss": 0.1815, "step": 11760, "task_loss": 0.3771222233772278 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02903624156277432, "compression/movement_sparsity/importance_threshold": -0.4898795406583259, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14260724186897278, "epoch": 4.25, "learning_rate": 3.903007090195768e-07, "loss": 0.1721, "step": 11770, "task_loss": 0.404729425907135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029104050780845457, "compression/movement_sparsity/importance_threshold": -0.4889558788931333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18275782465934753, "epoch": 4.26, "learning_rate": 3.8880190426438764e-07, "loss": 0.1657, "step": 11780, "task_loss": 0.2863193154335022 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029171774709678022, "compression/movement_sparsity/importance_threshold": -0.4880333788934188, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16939735412597656, "epoch": 4.26, "learning_rate": 3.873052880809493e-07, "loss": 0.1796, "step": 11790, "task_loss": 0.33054405450820923 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02923941340294343, "compression/movement_sparsity/importance_threshold": -0.4871120399280983, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18896028399467468, "epoch": 4.26, "learning_rate": 3.8581086582832967e-07, "loss": 0.1792, "step": 11800, "task_loss": 0.5677767992019653 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029306966914313084, "compression/movement_sparsity/importance_threshold": -0.48619186126608815, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1795285940170288, "epoch": 4.27, "learning_rate": 3.8431864285773964e-07, "loss": 0.1712, "step": 11810, "task_loss": 0.7608554363250732 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02937443529745841, "compression/movement_sparsity/importance_threshold": -0.4852728421763042, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1296456754207611, "epoch": 4.27, "learning_rate": 3.828286245125163e-07, "loss": 0.1655, "step": 11820, "task_loss": 0.385231614112854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029441818606050794, "compression/movement_sparsity/importance_threshold": -0.48435498192766263, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16423696279525757, "epoch": 4.28, "learning_rate": 3.8134081612810097e-07, "loss": 0.167, "step": 11830, "task_loss": 0.2643640637397766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02950911689376165, "compression/movement_sparsity/importance_threshold": -0.48343827978907966, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1867581307888031, "epoch": 4.28, "learning_rate": 3.7985522303202277e-07, "loss": 0.1719, "step": 11840, "task_loss": 0.4425698518753052 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02957633021426241, "compression/movement_sparsity/importance_threshold": -0.482522735029471, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13380573689937592, "epoch": 4.28, "learning_rate": 3.7837185054387833e-07, "loss": 0.1659, "step": 11850, "task_loss": 0.28523489832878113 }, { "compression/movement_sparsity/importance_regularization_factor": 0.02964345862122448, "compression/movement_sparsity/importance_threshold": -0.48160834691775284, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1440255045890808, "epoch": 4.29, "learning_rate": 3.7689070397531163e-07, "loss": 0.1602, "step": 11860, "task_loss": 0.47079092264175415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029710502168319258, "compression/movement_sparsity/importance_threshold": -0.4806951147228413, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17543549835681915, "epoch": 4.29, "learning_rate": 3.7541178862999714e-07, "loss": 0.1781, "step": 11870, "task_loss": 0.6562625169754028 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029777460909218165, "compression/movement_sparsity/importance_threshold": -0.47978303771365244, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13487908244132996, "epoch": 4.29, "learning_rate": 3.739351098036195e-07, "loss": 0.1778, "step": 11880, "task_loss": 0.23533663153648376 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0298443348975926, "compression/movement_sparsity/importance_threshold": -0.4788721151591024, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15928372740745544, "epoch": 4.3, "learning_rate": 3.724606727838551e-07, "loss": 0.1712, "step": 11890, "task_loss": 0.4500586986541748 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029911124187113988, "compression/movement_sparsity/importance_threshold": -0.477962346328107, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16884973645210266, "epoch": 4.3, "learning_rate": 3.70988482850352e-07, "loss": 0.1865, "step": 11900, "task_loss": 0.22457614541053772 }, { "compression/movement_sparsity/importance_regularization_factor": 0.029977828831453723, "compression/movement_sparsity/importance_threshold": -0.4770537304895827, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1822618544101715, "epoch": 4.3, "learning_rate": 3.695185452747127e-07, "loss": 0.1776, "step": 11910, "task_loss": 0.32499706745147705 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030044448884283235, "compression/movement_sparsity/importance_threshold": -0.4761462669124452, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15013720095157623, "epoch": 4.31, "learning_rate": 3.680508653204748e-07, "loss": 0.1711, "step": 11920, "task_loss": 0.37257882952690125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03011098439927392, "compression/movement_sparsity/importance_threshold": -0.47523995486561077, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13815215229988098, "epoch": 4.31, "learning_rate": 3.665854482430907e-07, "loss": 0.1778, "step": 11930, "task_loss": 0.34098368883132935 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030177435430097186, "compression/movement_sparsity/importance_threshold": -0.47433479361799546, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13309718668460846, "epoch": 4.32, "learning_rate": 3.65122299289911e-07, "loss": 0.162, "step": 11940, "task_loss": 0.17212031781673431 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03024380203042446, "compression/movement_sparsity/importance_threshold": -0.4734307824385152, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17650893330574036, "epoch": 4.32, "learning_rate": 3.636614237001637e-07, "loss": 0.1833, "step": 11950, "task_loss": 0.4818665087223053 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03031008425392714, "compression/movement_sparsity/importance_threshold": -0.47252792059608617, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1562044322490692, "epoch": 4.32, "learning_rate": 3.6220282670493706e-07, "loss": 0.1765, "step": 11960, "task_loss": 0.4606776535511017 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03037628215427664, "compression/movement_sparsity/importance_threshold": -0.47162620735962446, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16945087909698486, "epoch": 4.33, "learning_rate": 3.607465135271603e-07, "loss": 0.1905, "step": 11970, "task_loss": 0.40968573093414307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03044239578514437, "compression/movement_sparsity/importance_threshold": -0.4707256419980461, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17481261491775513, "epoch": 4.33, "learning_rate": 3.5929248938158396e-07, "loss": 0.1733, "step": 11980, "task_loss": 0.25828778743743896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030508425200201736, "compression/movement_sparsity/importance_threshold": -0.46982622378026717, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21180686354637146, "epoch": 4.33, "learning_rate": 3.578407594747624e-07, "loss": 0.1729, "step": 11990, "task_loss": 0.4141823351383209 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03057437045312016, "compression/movement_sparsity/importance_threshold": -0.46892795197520365, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19437111914157867, "epoch": 4.34, "learning_rate": 3.5639132900503533e-07, "loss": 0.1742, "step": 12000, "task_loss": 0.39397311210632324 }, { "epoch": 4.34, "eval_exact_match": 83.56669820245979, "eval_f1": 90.00705474053464, "step": 12000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030640231597571045, "compression/movement_sparsity/importance_threshold": -0.4680308258517717, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15808720886707306, "epoch": 4.34, "learning_rate": 3.549442031625084e-07, "loss": 0.1823, "step": 12010, "task_loss": 0.31429338455200195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030706008687225797, "compression/movement_sparsity/importance_threshold": -0.4671348446788875, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13028618693351746, "epoch": 4.34, "learning_rate": 3.534993871290338e-07, "loss": 0.1702, "step": 12020, "task_loss": 0.35157257318496704 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030771701775755823, "compression/movement_sparsity/importance_threshold": -0.46624000772546703, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1625824123620987, "epoch": 4.35, "learning_rate": 3.520568860781944e-07, "loss": 0.1868, "step": 12030, "task_loss": 0.38068217039108276 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030837310916832554, "compression/movement_sparsity/importance_threshold": -0.46534631426042616, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1509367823600769, "epoch": 4.35, "learning_rate": 3.5061670517528294e-07, "loss": 0.1763, "step": 12040, "task_loss": 0.3321918249130249 }, { "compression/movement_sparsity/importance_regularization_factor": 0.030902836164127395, "compression/movement_sparsity/importance_threshold": -0.46445376355268103, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16750577092170715, "epoch": 4.35, "learning_rate": 3.491788495772836e-07, "loss": 0.1792, "step": 12050, "task_loss": 0.8832313418388367 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03096827757131175, "compression/movement_sparsity/importance_threshold": -0.46356235487114783, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15269014239311218, "epoch": 4.36, "learning_rate": 3.477433244328553e-07, "loss": 0.1686, "step": 12060, "task_loss": 0.37979966402053833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031033635192057027, "compression/movement_sparsity/importance_threshold": -0.46267208748474264, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16275674104690552, "epoch": 4.36, "learning_rate": 3.4631013488231075e-07, "loss": 0.1806, "step": 12070, "task_loss": 0.38607725501060486 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03109890908003464, "compression/movement_sparsity/importance_threshold": -0.46178296066238145, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.24525508284568787, "epoch": 4.37, "learning_rate": 3.448792860576004e-07, "loss": 0.1838, "step": 12080, "task_loss": 0.5830104947090149 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031164099288916003, "compression/movement_sparsity/importance_threshold": -0.4608949736729803, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15277394652366638, "epoch": 4.37, "learning_rate": 3.434507830822934e-07, "loss": 0.1763, "step": 12090, "task_loss": 0.2695538401603699 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031229205872372517, "compression/movement_sparsity/importance_threshold": -0.46000812578545536, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16866612434387207, "epoch": 4.37, "learning_rate": 3.420246310715572e-07, "loss": 0.1634, "step": 12100, "task_loss": 0.5227435827255249 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03129422888407559, "compression/movement_sparsity/importance_threshold": -0.4591224162687228, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18312391638755798, "epoch": 4.38, "learning_rate": 3.4060083513214257e-07, "loss": 0.1721, "step": 12110, "task_loss": 0.5585439801216125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03135916837769666, "compression/movement_sparsity/importance_threshold": -0.4582378443916983, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15855933725833893, "epoch": 4.38, "learning_rate": 3.39179400362363e-07, "loss": 0.1687, "step": 12120, "task_loss": 0.47586819529533386 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03142402440690711, "compression/movement_sparsity/importance_threshold": -0.45735440942329825, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17082326114177704, "epoch": 4.38, "learning_rate": 3.377603318520779e-07, "loss": 0.1633, "step": 12130, "task_loss": 0.49885010719299316 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03148879702537836, "compression/movement_sparsity/importance_threshold": -0.45647211063243853, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1388573944568634, "epoch": 4.39, "learning_rate": 3.3634363468267177e-07, "loss": 0.1781, "step": 12140, "task_loss": 0.5175285339355469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031553486286781826, "compression/movement_sparsity/importance_threshold": -0.4555909472880353, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1499364674091339, "epoch": 4.39, "learning_rate": 3.349293139270398e-07, "loss": 0.1753, "step": 12150, "task_loss": 0.13086864352226257 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03161809224478891, "compression/movement_sparsity/importance_threshold": -0.45471091865900465, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2038741409778595, "epoch": 4.39, "learning_rate": 3.335173746495672e-07, "loss": 0.1774, "step": 12160, "task_loss": 0.616294801235199 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03168261495307103, "compression/movement_sparsity/importance_threshold": -0.4538320240142626, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16956669092178345, "epoch": 4.4, "learning_rate": 3.3210782190611054e-07, "loss": 0.1763, "step": 12170, "task_loss": 0.5146920680999756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03174705446529958, "compression/movement_sparsity/importance_threshold": -0.45295426262272526, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1898520290851593, "epoch": 4.4, "learning_rate": 3.3070066074398226e-07, "loss": 0.1798, "step": 12180, "task_loss": 0.5947543382644653 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03181141083514599, "compression/movement_sparsity/importance_threshold": -0.45207763375330867, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13102659583091736, "epoch": 4.41, "learning_rate": 3.2929589620192975e-07, "loss": 0.1581, "step": 12190, "task_loss": 0.2658257484436035 }, { "compression/movement_sparsity/importance_regularization_factor": 0.031875684116281666, "compression/movement_sparsity/importance_threshold": -0.45120213667492887, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18428611755371094, "epoch": 4.41, "learning_rate": 3.278935333101196e-07, "loss": 0.1835, "step": 12200, "task_loss": 0.45930129289627075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03193987436237801, "compression/movement_sparsity/importance_threshold": -0.45032777065650204, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15300177037715912, "epoch": 4.41, "learning_rate": 3.264935770901183e-07, "loss": 0.183, "step": 12210, "task_loss": 0.296694278717041 }, { "compression/movement_sparsity/importance_regularization_factor": 0.032003981627106444, "compression/movement_sparsity/importance_threshold": -0.4494545349669441, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17390677332878113, "epoch": 4.42, "learning_rate": 3.2509603255487394e-07, "loss": 0.178, "step": 12220, "task_loss": 0.5502451062202454 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03206800596413837, "compression/movement_sparsity/importance_threshold": -0.44858242887517114, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18281325697898865, "epoch": 4.42, "learning_rate": 3.237009047086997e-07, "loss": 0.169, "step": 12230, "task_loss": 0.4404161274433136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0321319474271452, "compression/movement_sparsity/importance_threshold": -0.4477114516500993, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1602400243282318, "epoch": 4.42, "learning_rate": 3.2230819854725465e-07, "loss": 0.1723, "step": 12240, "task_loss": 0.5139293670654297 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03219580606979835, "compression/movement_sparsity/importance_threshold": -0.44684160256064465, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15721780061721802, "epoch": 4.43, "learning_rate": 3.2091791905752673e-07, "loss": 0.1617, "step": 12250, "task_loss": 0.4229050874710083 }, { "epoch": 4.43, "eval_exact_match": 83.62346263008514, "eval_f1": 89.97990512757178, "step": 12250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03225958194576923, "compression/movement_sparsity/importance_threshold": -0.4459728808757232, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15479019284248352, "epoch": 4.43, "learning_rate": 3.1953007121781425e-07, "loss": 0.1746, "step": 12260, "task_loss": 0.5893458127975464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03232327510872924, "compression/movement_sparsity/importance_threshold": -0.445105285864251, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15340088307857513, "epoch": 4.43, "learning_rate": 3.181446599977078e-07, "loss": 0.1625, "step": 12270, "task_loss": 0.6467044353485107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0323868856123498, "compression/movement_sparsity/importance_threshold": -0.4442388167951442, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1622926890850067, "epoch": 4.44, "learning_rate": 3.167616903580738e-07, "loss": 0.1754, "step": 12280, "task_loss": 0.4219571352005005 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03245041351030232, "compression/movement_sparsity/importance_threshold": -0.4433734729373189, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16757610440254211, "epoch": 4.44, "learning_rate": 3.1538116725103506e-07, "loss": 0.1793, "step": 12290, "task_loss": 0.3955652713775635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0325138588562582, "compression/movement_sparsity/importance_threshold": -0.442509253559691, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19136472046375275, "epoch": 4.45, "learning_rate": 3.1400309561995473e-07, "loss": 0.1777, "step": 12300, "task_loss": 0.5077202320098877 }, { "compression/movement_sparsity/importance_regularization_factor": 0.032577221703888874, "compression/movement_sparsity/importance_threshold": -0.4416461579311766, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17287388443946838, "epoch": 4.45, "learning_rate": 3.126274803994169e-07, "loss": 0.1769, "step": 12310, "task_loss": 0.2892530560493469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03264050210686572, "compression/movement_sparsity/importance_threshold": -0.440784185320692, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1708020567893982, "epoch": 4.45, "learning_rate": 3.1125432651521034e-07, "loss": 0.178, "step": 12320, "task_loss": 0.5046877264976501 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03270370011886018, "compression/movement_sparsity/importance_threshold": -0.439923334997153, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17857205867767334, "epoch": 4.46, "learning_rate": 3.098836388843105e-07, "loss": 0.1756, "step": 12330, "task_loss": 0.47654569149017334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03276681579354365, "compression/movement_sparsity/importance_threshold": -0.43906360622947577, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15070676803588867, "epoch": 4.46, "learning_rate": 3.085154224148605e-07, "loss": 0.1623, "step": 12340, "task_loss": 0.6382228136062622 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03282984918458754, "compression/movement_sparsity/importance_threshold": -0.43820499828657633, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14683285355567932, "epoch": 4.46, "learning_rate": 3.071496820061561e-07, "loss": 0.1704, "step": 12350, "task_loss": 0.3013722598552704 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03289280034566327, "compression/movement_sparsity/importance_threshold": -0.43734751043737075, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11898720264434814, "epoch": 4.47, "learning_rate": 3.057864225486262e-07, "loss": 0.1761, "step": 12360, "task_loss": 0.18860679864883423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.032955669330442236, "compression/movement_sparsity/importance_threshold": -0.4364911419507752, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17226265370845795, "epoch": 4.47, "learning_rate": 3.044256489238159e-07, "loss": 0.1697, "step": 12370, "task_loss": 0.2961030602455139 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03301845619259586, "compression/movement_sparsity/importance_threshold": -0.43563589209570563, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1485224962234497, "epoch": 4.47, "learning_rate": 3.030673660043698e-07, "loss": 0.1871, "step": 12380, "task_loss": 0.31120193004608154 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03308116098579554, "compression/movement_sparsity/importance_threshold": -0.43478176014107817, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1510060578584671, "epoch": 4.48, "learning_rate": 3.018470449500072e-07, "loss": 0.178, "step": 12390, "task_loss": 0.49028700590133667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0331437837637127, "compression/movement_sparsity/importance_threshold": -0.433928745355809, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16419251263141632, "epoch": 4.48, "learning_rate": 3.004935077629299e-07, "loss": 0.1782, "step": 12400, "task_loss": 0.3531630039215088 }, { "compression/movement_sparsity/importance_regularization_factor": 0.033206324580018744, "compression/movement_sparsity/importance_threshold": -0.43307684700881394, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18890751898288727, "epoch": 4.49, "learning_rate": 2.991424753613858e-07, "loss": 0.1893, "step": 12410, "task_loss": 0.4802757799625397 }, { "compression/movement_sparsity/importance_regularization_factor": 0.033268783488385086, "compression/movement_sparsity/importance_threshold": -0.4322260643690091, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14421889185905457, "epoch": 4.49, "learning_rate": 2.97793952583138e-07, "loss": 0.163, "step": 12420, "task_loss": 0.6870753765106201 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03333116054248314, "compression/movement_sparsity/importance_threshold": -0.4313763967053107, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14144985377788544, "epoch": 4.49, "learning_rate": 2.9644794425696316e-07, "loss": 0.166, "step": 12430, "task_loss": 0.2926397919654846 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0333934557959843, "compression/movement_sparsity/importance_threshold": -0.43052784328663474, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14997738599777222, "epoch": 4.5, "learning_rate": 2.9510445520263315e-07, "loss": 0.1834, "step": 12440, "task_loss": 0.2823546528816223 }, { "compression/movement_sparsity/importance_regularization_factor": 0.033455669302560005, "compression/movement_sparsity/importance_threshold": -0.4296804033818971, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1645713895559311, "epoch": 4.5, "learning_rate": 2.937634902309001e-07, "loss": 0.1829, "step": 12450, "task_loss": 0.41032591462135315 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03351780111588164, "compression/movement_sparsity/importance_threshold": -0.42883407626001413, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14776745438575745, "epoch": 4.5, "learning_rate": 2.9242505414347683e-07, "loss": 0.1743, "step": 12460, "task_loss": 0.6065025925636292 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03357985128962062, "compression/movement_sparsity/importance_threshold": -0.4279888611899018, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17194849252700806, "epoch": 4.51, "learning_rate": 2.910891517330215e-07, "loss": 0.1771, "step": 12470, "task_loss": 0.49688899517059326 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03364181987744837, "compression/movement_sparsity/importance_threshold": -0.42714475744047614, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17045895755290985, "epoch": 4.51, "learning_rate": 2.897557877831196e-07, "loss": 0.1693, "step": 12480, "task_loss": 0.3933444917201996 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03370370693303629, "compression/movement_sparsity/importance_threshold": -0.42630176428065325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1370893120765686, "epoch": 4.51, "learning_rate": 2.884249670682668e-07, "loss": 0.1849, "step": 12490, "task_loss": 0.28957247734069824 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03376551251005577, "compression/movement_sparsity/importance_threshold": -0.4254598809793493, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16283123195171356, "epoch": 4.52, "learning_rate": 2.870966943538522e-07, "loss": 0.167, "step": 12500, "task_loss": 0.2896598279476166 }, { "epoch": 4.52, "eval_exact_match": 83.43424787133397, "eval_f1": 89.91171539061436, "step": 12500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03382723666217826, "compression/movement_sparsity/importance_threshold": -0.42461910680548015, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13904690742492676, "epoch": 4.52, "learning_rate": 2.857709743961404e-07, "loss": 0.1662, "step": 12510, "task_loss": 0.2602207660675049 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03388887944307515, "compression/movement_sparsity/importance_threshold": -0.4237794410279619, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15818578004837036, "epoch": 4.52, "learning_rate": 2.84447811942256e-07, "loss": 0.1918, "step": 12520, "task_loss": 0.31633198261260986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03395044090641785, "compression/movement_sparsity/importance_threshold": -0.42294088291571075, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17294219136238098, "epoch": 4.53, "learning_rate": 2.8312721173016476e-07, "loss": 0.1681, "step": 12530, "task_loss": 0.3709946870803833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03401192110587777, "compression/movement_sparsity/importance_threshold": -0.4221034317376427, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12054181098937988, "epoch": 4.53, "learning_rate": 2.818091784886585e-07, "loss": 0.168, "step": 12540, "task_loss": 0.2598145604133606 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034073320095126335, "compression/movement_sparsity/importance_threshold": -0.42126708676267377, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15481901168823242, "epoch": 4.54, "learning_rate": 2.80493716937337e-07, "loss": 0.1767, "step": 12550, "task_loss": 0.38484764099121094 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034134637927834936, "compression/movement_sparsity/importance_threshold": -0.42043184725972005, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1437249779701233, "epoch": 4.54, "learning_rate": 2.791808317865907e-07, "loss": 0.1773, "step": 12560, "task_loss": 0.3040648102760315 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03419587465767499, "compression/movement_sparsity/importance_threshold": -0.41959771249769773, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18909630179405212, "epoch": 4.54, "learning_rate": 2.778705277375857e-07, "loss": 0.1829, "step": 12570, "task_loss": 0.3867703080177307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03425703033831792, "compression/movement_sparsity/importance_threshold": -0.41876468174552267, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1678750216960907, "epoch": 4.55, "learning_rate": 2.765628094822443e-07, "loss": 0.1736, "step": 12580, "task_loss": 0.4944628179073334 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034318105023435116, "compression/movement_sparsity/importance_threshold": -0.41793275427211113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16761477291584015, "epoch": 4.55, "learning_rate": 2.7525768170323084e-07, "loss": 0.1873, "step": 12590, "task_loss": 0.2373383790254593 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03437909876669801, "compression/movement_sparsity/importance_threshold": -0.4171019293463789, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15242645144462585, "epoch": 4.55, "learning_rate": 2.7395514907393304e-07, "loss": 0.1712, "step": 12600, "task_loss": 0.2393788993358612 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034440011621777976, "compression/movement_sparsity/importance_threshold": -0.4162722062372427, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19179469347000122, "epoch": 4.56, "learning_rate": 2.7265521625844623e-07, "loss": 0.1749, "step": 12610, "task_loss": 0.3033488392829895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03450084364234648, "compression/movement_sparsity/importance_threshold": -0.41544358421361766, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17567074298858643, "epoch": 4.56, "learning_rate": 2.7135788791155645e-07, "loss": 0.1815, "step": 12620, "task_loss": 0.608439564704895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034561594882074886, "compression/movement_sparsity/importance_threshold": -0.41461606254442046, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13859786093235016, "epoch": 4.56, "learning_rate": 2.7006316867872303e-07, "loss": 0.1801, "step": 12630, "task_loss": 0.2615768313407898 }, { "compression/movement_sparsity/importance_regularization_factor": 0.034622265394634616, "compression/movement_sparsity/importance_threshold": -0.41378964049856715, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1449252963066101, "epoch": 4.57, "learning_rate": 2.6877106319606344e-07, "loss": 0.1744, "step": 12640, "task_loss": 0.35204145312309265 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0346828552336971, "compression/movement_sparsity/importance_threshold": -0.41296431734497346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17655514180660248, "epoch": 4.57, "learning_rate": 2.6748157609033507e-07, "loss": 0.1744, "step": 12650, "task_loss": 0.3563224971294403 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03474336445293374, "compression/movement_sparsity/importance_threshold": -0.4121400923525557, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19780133664608002, "epoch": 4.58, "learning_rate": 2.661947119789202e-07, "loss": 0.1878, "step": 12660, "task_loss": 0.5263267755508423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03480379310601593, "compression/movement_sparsity/importance_threshold": -0.41131696479023, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1600715070962906, "epoch": 4.58, "learning_rate": 2.649104754698085e-07, "loss": 0.1757, "step": 12670, "task_loss": 0.3594798445701599 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03486414124661509, "compression/movement_sparsity/importance_threshold": -0.4104949339269123, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18169142305850983, "epoch": 4.58, "learning_rate": 2.636288711615801e-07, "loss": 0.1801, "step": 12680, "task_loss": 0.2691894769668579 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03492440892840263, "compression/movement_sparsity/importance_threshold": -0.4096739990315188, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13344717025756836, "epoch": 4.59, "learning_rate": 2.623499036433909e-07, "loss": 0.1725, "step": 12690, "task_loss": 0.42940986156463623 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03498459620504996, "compression/movement_sparsity/importance_threshold": -0.4088541593729654, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1338200718164444, "epoch": 4.59, "learning_rate": 2.6107357749495396e-07, "loss": 0.1853, "step": 12700, "task_loss": 0.0940115749835968 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035044703130228505, "compression/movement_sparsity/importance_threshold": -0.4080354142201682, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17829687893390656, "epoch": 4.59, "learning_rate": 2.5979989728652486e-07, "loss": 0.1591, "step": 12710, "task_loss": 0.4869588017463684 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03510472975760966, "compression/movement_sparsity/importance_threshold": -0.4072177628420433, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19079387187957764, "epoch": 4.6, "learning_rate": 2.5852886757888417e-07, "loss": 0.1801, "step": 12720, "task_loss": 0.3848039507865906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035164676140864846, "compression/movement_sparsity/importance_threshold": -0.40640120450750666, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16148385405540466, "epoch": 4.6, "learning_rate": 2.57260492923322e-07, "loss": 0.1754, "step": 12730, "task_loss": 0.41788214445114136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035224542333665454, "compression/movement_sparsity/importance_threshold": -0.40558573848547463, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18705248832702637, "epoch": 4.6, "learning_rate": 2.5599477786162115e-07, "loss": 0.1689, "step": 12740, "task_loss": 0.4458879232406616 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035284328389682915, "compression/movement_sparsity/importance_threshold": -0.404771364044863, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17292597889900208, "epoch": 4.61, "learning_rate": 2.547317269260405e-07, "loss": 0.1806, "step": 12750, "task_loss": 0.3828756511211395 }, { "epoch": 4.61, "eval_exact_match": 83.59508041627247, "eval_f1": 89.98942260118159, "step": 12750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03534403436258863, "compression/movement_sparsity/importance_threshold": -0.403958080454588, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12280981242656708, "epoch": 4.61, "learning_rate": 2.534713446393002e-07, "loss": 0.1769, "step": 12760, "task_loss": 0.34313008189201355 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035403660306054026, "compression/movement_sparsity/importance_threshold": -0.40314588698356546, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1836218684911728, "epoch": 4.62, "learning_rate": 2.522136355145632e-07, "loss": 0.1796, "step": 12770, "task_loss": 0.5304206013679504 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035463206273750485, "compression/movement_sparsity/importance_threshold": -0.40233478290071184, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1639278084039688, "epoch": 4.62, "learning_rate": 2.5095860405542167e-07, "loss": 0.1805, "step": 12780, "task_loss": 0.4174896776676178 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03552267231934944, "compression/movement_sparsity/importance_threshold": -0.4015247674749428, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1624828726053238, "epoch": 4.62, "learning_rate": 2.497062547558793e-07, "loss": 0.1804, "step": 12790, "task_loss": 0.31818675994873047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03558205849652229, "compression/movement_sparsity/importance_threshold": -0.40071583997517474, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16943714022636414, "epoch": 4.63, "learning_rate": 2.4845659210033477e-07, "loss": 0.1765, "step": 12800, "task_loss": 0.44528499245643616 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03564136485894044, "compression/movement_sparsity/importance_threshold": -0.39990799967032353, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17110618948936462, "epoch": 4.63, "learning_rate": 2.4720962056356776e-07, "loss": 0.177, "step": 12810, "task_loss": 0.3735952079296112 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03570059146027532, "compression/movement_sparsity/importance_threshold": -0.39910124582930534, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14698469638824463, "epoch": 4.63, "learning_rate": 2.4596534461072025e-07, "loss": 0.1644, "step": 12820, "task_loss": 0.2619709074497223 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03575973835419833, "compression/movement_sparsity/importance_threshold": -0.3982955777210361, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14263629913330078, "epoch": 4.64, "learning_rate": 2.4472376869728286e-07, "loss": 0.1668, "step": 12830, "task_loss": 0.20229429006576538 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03581880559438087, "compression/movement_sparsity/importance_threshold": -0.39749099461443205, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1636548638343811, "epoch": 4.64, "learning_rate": 2.4348489726907773e-07, "loss": 0.1836, "step": 12840, "task_loss": 0.6526674628257751 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03587779323449438, "compression/movement_sparsity/importance_threshold": -0.396687495778409, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17002159357070923, "epoch": 4.64, "learning_rate": 2.422487347622425e-07, "loss": 0.1933, "step": 12850, "task_loss": 0.3556768298149109 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035936701328210245, "compression/movement_sparsity/importance_threshold": -0.3958850804818833, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13383722305297852, "epoch": 4.65, "learning_rate": 2.410152856032154e-07, "loss": 0.1743, "step": 12860, "task_loss": 0.4999997615814209 }, { "compression/movement_sparsity/importance_regularization_factor": 0.035995529929199875, "compression/movement_sparsity/importance_threshold": -0.39508374799377105, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16238951683044434, "epoch": 4.65, "learning_rate": 2.397845542087177e-07, "loss": 0.1743, "step": 12870, "task_loss": 0.3286881744861603 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0360542790911347, "compression/movement_sparsity/importance_threshold": -0.3942834975829879, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1793985217809677, "epoch": 4.65, "learning_rate": 2.385565449857401e-07, "loss": 0.1743, "step": 12880, "task_loss": 0.7459797263145447 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036112948867686116, "compression/movement_sparsity/importance_threshold": -0.39348432851845033, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18460488319396973, "epoch": 4.66, "learning_rate": 2.3733126233152456e-07, "loss": 0.1882, "step": 12890, "task_loss": 0.3585508465766907 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03617153931252552, "compression/movement_sparsity/importance_threshold": -0.39268624006907443, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15785962343215942, "epoch": 4.66, "learning_rate": 2.3610871063355065e-07, "loss": 0.187, "step": 12900, "task_loss": 0.519625723361969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03623005047932436, "compression/movement_sparsity/importance_threshold": -0.3918892315037759, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14596977829933167, "epoch": 4.67, "learning_rate": 2.3488889426951907e-07, "loss": 0.1701, "step": 12910, "task_loss": 0.41778382658958435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03628848242175402, "compression/movement_sparsity/importance_threshold": -0.3910933020914711, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17587855458259583, "epoch": 4.67, "learning_rate": 2.336718176073349e-07, "loss": 0.1728, "step": 12920, "task_loss": 0.8074888586997986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03634683519348591, "compression/movement_sparsity/importance_threshold": -0.3902984511010759, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12496951222419739, "epoch": 4.67, "learning_rate": 2.32457485005094e-07, "loss": 0.1697, "step": 12930, "task_loss": 0.2537425756454468 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03640510884819145, "compression/movement_sparsity/importance_threshold": -0.3895046778015065, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14235326647758484, "epoch": 4.68, "learning_rate": 2.3124590081106553e-07, "loss": 0.1806, "step": 12940, "task_loss": 0.3784465789794922 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03646330343954205, "compression/movement_sparsity/importance_threshold": -0.38871198146167896, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14437982439994812, "epoch": 4.68, "learning_rate": 2.300370693636775e-07, "loss": 0.1776, "step": 12950, "task_loss": 0.41959238052368164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036521419021209124, "compression/movement_sparsity/importance_threshold": -0.38792036135050934, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1669291853904724, "epoch": 4.68, "learning_rate": 2.2883099499150116e-07, "loss": 0.1741, "step": 12960, "task_loss": 0.8136731386184692 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036579455646864066, "compression/movement_sparsity/importance_threshold": -0.38712981673691366, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13533678650856018, "epoch": 4.69, "learning_rate": 2.276276820132349e-07, "loss": 0.1852, "step": 12970, "task_loss": 0.2773071527481079 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0366374133701783, "compression/movement_sparsity/importance_threshold": -0.38634034688980806, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17692357301712036, "epoch": 4.69, "learning_rate": 2.264271347376895e-07, "loss": 0.1635, "step": 12980, "task_loss": 0.3910565972328186 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03669529224482323, "compression/movement_sparsity/importance_threshold": -0.38555195107810863, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18977247178554535, "epoch": 4.69, "learning_rate": 2.252293574637717e-07, "loss": 0.1751, "step": 12990, "task_loss": 0.4595882296562195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03675309232447028, "compression/movement_sparsity/importance_threshold": -0.38476462857073124, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1279372274875641, "epoch": 4.7, "learning_rate": 2.2403435448047014e-07, "loss": 0.1708, "step": 13000, "task_loss": 0.4735584557056427 }, { "epoch": 4.7, "eval_exact_match": 83.4720908230842, "eval_f1": 89.90774706672684, "step": 13000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03681081366279085, "compression/movement_sparsity/importance_threshold": -0.3839783786365921, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15371698141098022, "epoch": 4.7, "learning_rate": 2.228421300668386e-07, "loss": 0.1755, "step": 13010, "task_loss": 0.5671071410179138 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03686845631345635, "compression/movement_sparsity/importance_threshold": -0.38319320054460726, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14245186746120453, "epoch": 4.71, "learning_rate": 2.2165268849198205e-07, "loss": 0.1774, "step": 13020, "task_loss": 0.5267888307571411 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03692602033013819, "compression/movement_sparsity/importance_threshold": -0.3824090935636929, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1692107617855072, "epoch": 4.71, "learning_rate": 2.2046603401504082e-07, "loss": 0.1713, "step": 13030, "task_loss": 0.4950706362724304 }, { "compression/movement_sparsity/importance_regularization_factor": 0.036983505766507795, "compression/movement_sparsity/importance_threshold": -0.38162605696276486, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1489468812942505, "epoch": 4.71, "learning_rate": 2.192821708851741e-07, "loss": 0.1798, "step": 13040, "task_loss": 0.28536754846572876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03704091267623654, "compression/movement_sparsity/importance_threshold": -0.3808440900107394, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1268996000289917, "epoch": 4.72, "learning_rate": 2.181011033415473e-07, "loss": 0.165, "step": 13050, "task_loss": 0.2892007827758789 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03709824111299588, "compression/movement_sparsity/importance_threshold": -0.3800631919765325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1905529797077179, "epoch": 4.72, "learning_rate": 2.1692283561331414e-07, "loss": 0.1896, "step": 13060, "task_loss": 0.5020818710327148 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03715549113045719, "compression/movement_sparsity/importance_threshold": -0.3792833621290602, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11118581891059875, "epoch": 4.72, "learning_rate": 2.157473719196038e-07, "loss": 0.1721, "step": 13070, "task_loss": 0.2518026828765869 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03721266278229191, "compression/movement_sparsity/importance_threshold": -0.3785045997372386, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14366507530212402, "epoch": 4.73, "learning_rate": 2.145747164695041e-07, "loss": 0.1898, "step": 13080, "task_loss": 0.4140687584877014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03726975612217143, "compression/movement_sparsity/importance_threshold": -0.37772690406998377, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13824304938316345, "epoch": 4.73, "learning_rate": 2.1340487346204762e-07, "loss": 0.1739, "step": 13090, "task_loss": 0.3569027781486511 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03732677120376716, "compression/movement_sparsity/importance_threshold": -0.37695027439621187, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13565212488174438, "epoch": 4.73, "learning_rate": 2.1223784708619608e-07, "loss": 0.1727, "step": 13100, "task_loss": 0.4583956003189087 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037383708080750526, "compression/movement_sparsity/importance_threshold": -0.3761747099848388, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15867725014686584, "epoch": 4.74, "learning_rate": 2.1107364152082507e-07, "loss": 0.1761, "step": 13110, "task_loss": 0.2641053795814514 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037440566806792926, "compression/movement_sparsity/importance_threshold": -0.37540021010478075, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1482830047607422, "epoch": 4.74, "learning_rate": 2.099122609347097e-07, "loss": 0.1914, "step": 13120, "task_loss": 0.7968321442604065 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03749734743556577, "compression/movement_sparsity/importance_threshold": -0.3746267740249538, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2047840803861618, "epoch": 4.75, "learning_rate": 2.0875370948650973e-07, "loss": 0.1966, "step": 13130, "task_loss": 0.49344414472579956 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03755405002074048, "compression/movement_sparsity/importance_threshold": -0.3738544010142738, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1491725742816925, "epoch": 4.75, "learning_rate": 2.0759799132475365e-07, "loss": 0.1798, "step": 13140, "task_loss": 0.4386056661605835 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03761067461598845, "compression/movement_sparsity/importance_threshold": -0.3730830903416571, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17659732699394226, "epoch": 4.75, "learning_rate": 2.0644511058782553e-07, "loss": 0.182, "step": 13150, "task_loss": 0.32818663120269775 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037667221274981115, "compression/movement_sparsity/importance_threshold": -0.3723128412760196, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16805681586265564, "epoch": 4.76, "learning_rate": 2.0529507140394798e-07, "loss": 0.1699, "step": 13160, "task_loss": 0.23117676377296448 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03772369005138986, "compression/movement_sparsity/importance_threshold": -0.37154365308627746, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15759176015853882, "epoch": 4.76, "learning_rate": 2.0414787789116994e-07, "loss": 0.175, "step": 13170, "task_loss": 0.5075294375419617 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03778008099888611, "compression/movement_sparsity/importance_threshold": -0.3707755250413466, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16308321058750153, "epoch": 4.76, "learning_rate": 2.0300353415734927e-07, "loss": 0.1767, "step": 13180, "task_loss": 0.3911818265914917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037836394171141256, "compression/movement_sparsity/importance_threshold": -0.37000845641014346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1465543508529663, "epoch": 4.77, "learning_rate": 2.0186204430014042e-07, "loss": 0.1699, "step": 13190, "task_loss": 0.4668183922767639 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03789262962182674, "compression/movement_sparsity/importance_threshold": -0.36924244646158355, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16283223032951355, "epoch": 4.77, "learning_rate": 2.0072341240697842e-07, "loss": 0.1668, "step": 13200, "task_loss": 0.5066724419593811 }, { "compression/movement_sparsity/importance_regularization_factor": 0.037948787404613965, "compression/movement_sparsity/importance_threshold": -0.3684774944645832, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2064799666404724, "epoch": 4.77, "learning_rate": 1.995876425550642e-07, "loss": 0.1858, "step": 13210, "task_loss": 0.6612112522125244 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03800486757317432, "compression/movement_sparsity/importance_threshold": -0.3677135996880586, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1432875096797943, "epoch": 4.78, "learning_rate": 1.9845473881135112e-07, "loss": 0.1654, "step": 13220, "task_loss": 0.21210426092147827 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03806087018117923, "compression/movement_sparsity/importance_threshold": -0.36695076140092575, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10303863883018494, "epoch": 4.78, "learning_rate": 1.9732470523252832e-07, "loss": 0.1533, "step": 13230, "task_loss": 0.14866429567337036 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03811679528230011, "compression/movement_sparsity/importance_threshold": -0.36618897887210056, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12185746431350708, "epoch": 4.78, "learning_rate": 1.9619754586500859e-07, "loss": 0.1716, "step": 13240, "task_loss": 0.6474106311798096 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03817264293020836, "compression/movement_sparsity/importance_threshold": -0.3654282513704993, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18789950013160706, "epoch": 4.79, "learning_rate": 1.9507326474491258e-07, "loss": 0.1776, "step": 13250, "task_loss": 0.5531033873558044 }, { "epoch": 4.79, "eval_exact_match": 83.50993377483444, "eval_f1": 89.93742165413668, "step": 13250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0382284131785754, "compression/movement_sparsity/importance_threshold": -0.36466857816503795, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1531079113483429, "epoch": 4.79, "learning_rate": 1.93951865898054e-07, "loss": 0.1685, "step": 13260, "task_loss": 0.2040596902370453 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038284106081072626, "compression/movement_sparsity/importance_threshold": -0.3639099585246327, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11716160178184509, "epoch": 4.8, "learning_rate": 1.9283335333992655e-07, "loss": 0.1904, "step": 13270, "task_loss": 0.36657923460006714 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03833972169137147, "compression/movement_sparsity/importance_threshold": -0.36315239171819935, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1429518759250641, "epoch": 4.8, "learning_rate": 1.9171773107568766e-07, "loss": 0.1871, "step": 13280, "task_loss": 0.605373740196228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03839526006314333, "compression/movement_sparsity/importance_threshold": -0.36239587701465403, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14882807433605194, "epoch": 4.8, "learning_rate": 1.906050031001466e-07, "loss": 0.1859, "step": 13290, "task_loss": 0.2876370847225189 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03845072125005961, "compression/movement_sparsity/importance_threshold": -0.3616404136829131, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15496216714382172, "epoch": 4.81, "learning_rate": 1.8949517339774746e-07, "loss": 0.1685, "step": 13300, "task_loss": 0.49225640296936035 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038506105305791745, "compression/movement_sparsity/importance_threshold": -0.36088600099189216, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14122232794761658, "epoch": 4.81, "learning_rate": 1.8838824594255708e-07, "loss": 0.1756, "step": 13310, "task_loss": 0.3297005295753479 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03856141228401112, "compression/movement_sparsity/importance_threshold": -0.36013263821050767, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.167262464761734, "epoch": 4.81, "learning_rate": 1.8728422469824977e-07, "loss": 0.1751, "step": 13320, "task_loss": 0.6074428558349609 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038616642238389155, "compression/movement_sparsity/importance_threshold": -0.35938032460767566, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13553129136562347, "epoch": 4.82, "learning_rate": 1.8618311361809324e-07, "loss": 0.1715, "step": 13330, "task_loss": 0.1803513467311859 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03867179522259727, "compression/movement_sparsity/importance_threshold": -0.35862905945231194, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14316369593143463, "epoch": 4.82, "learning_rate": 1.8508491664493465e-07, "loss": 0.1755, "step": 13340, "task_loss": 0.2262798398733139 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03872687129030685, "compression/movement_sparsity/importance_threshold": -0.35787884201333287, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16255033016204834, "epoch": 4.82, "learning_rate": 1.839896377111859e-07, "loss": 0.18, "step": 13350, "task_loss": 0.5666951537132263 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03878187049518934, "compression/movement_sparsity/importance_threshold": -0.35712967155965425, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16210150718688965, "epoch": 4.83, "learning_rate": 1.828972807388106e-07, "loss": 0.1819, "step": 13360, "task_loss": 0.6401165723800659 }, { "compression/movement_sparsity/importance_regularization_factor": 0.038836792890916114, "compression/movement_sparsity/importance_threshold": -0.35638154736019234, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1632276177406311, "epoch": 4.83, "learning_rate": 1.8180784963930928e-07, "loss": 0.1725, "step": 13370, "task_loss": 0.457368403673172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03889163853115862, "compression/movement_sparsity/importance_threshold": -0.35563446868386306, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1731918752193451, "epoch": 4.84, "learning_rate": 1.8072134831370512e-07, "loss": 0.1622, "step": 13380, "task_loss": 0.352400541305542 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03894640746958824, "compression/movement_sparsity/importance_threshold": -0.35488843479958265, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18019556999206543, "epoch": 4.84, "learning_rate": 1.796377806525311e-07, "loss": 0.1895, "step": 13390, "task_loss": 0.44412076473236084 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03900109975987639, "compression/movement_sparsity/importance_threshold": -0.35414344497626715, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11305814981460571, "epoch": 4.84, "learning_rate": 1.7855715053581445e-07, "loss": 0.1765, "step": 13400, "task_loss": 0.5358411073684692 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03905571545569449, "compression/movement_sparsity/importance_threshold": -0.3533994984828325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15808236598968506, "epoch": 4.85, "learning_rate": 1.7747946183306471e-07, "loss": 0.1693, "step": 13410, "task_loss": 0.43483972549438477 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03911025461071394, "compression/movement_sparsity/importance_threshold": -0.35265659458819476, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15852884948253632, "epoch": 4.85, "learning_rate": 1.764047184032579e-07, "loss": 0.184, "step": 13420, "task_loss": 0.6633070707321167 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03916471727860616, "compression/movement_sparsity/importance_threshold": -0.35191473256127015, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1737813502550125, "epoch": 4.85, "learning_rate": 1.7533292409482414e-07, "loss": 0.1646, "step": 13430, "task_loss": 0.3524114489555359 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039219103513042565, "compression/movement_sparsity/importance_threshold": -0.35117391167097467, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15978504717350006, "epoch": 4.86, "learning_rate": 1.7426408274563343e-07, "loss": 0.177, "step": 13440, "task_loss": 0.5557321906089783 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03927341336769454, "compression/movement_sparsity/importance_threshold": -0.35043413118622435, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1573951095342636, "epoch": 4.86, "learning_rate": 1.7319819818298166e-07, "loss": 0.1765, "step": 13450, "task_loss": 0.5514523983001709 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03932764689623351, "compression/movement_sparsity/importance_threshold": -0.34969539037593544, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16307415068149567, "epoch": 4.86, "learning_rate": 1.7213527422357732e-07, "loss": 0.1692, "step": 13460, "task_loss": 0.3270409107208252 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039381804152330925, "compression/movement_sparsity/importance_threshold": -0.3489576885090234, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1804170459508896, "epoch": 4.87, "learning_rate": 1.7107531467352697e-07, "loss": 0.1793, "step": 13470, "task_loss": 0.3031477630138397 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03943588518965812, "compression/movement_sparsity/importance_threshold": -0.3482210248544052, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.176797017455101, "epoch": 4.87, "learning_rate": 1.70018323328323e-07, "loss": 0.1721, "step": 13480, "task_loss": 0.528610348701477 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03948989006188655, "compression/movement_sparsity/importance_threshold": -0.34748539868099637, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14670194685459137, "epoch": 4.88, "learning_rate": 1.6896430397282914e-07, "loss": 0.1938, "step": 13490, "task_loss": 0.41166549921035767 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039543818822687635, "compression/movement_sparsity/importance_threshold": -0.34675080925771296, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20626819133758545, "epoch": 4.88, "learning_rate": 1.679132603812663e-07, "loss": 0.1797, "step": 13500, "task_loss": 0.8171520233154297 }, { "epoch": 4.88, "eval_exact_match": 83.61400189214758, "eval_f1": 90.0307786000334, "step": 13500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039597671525732764, "compression/movement_sparsity/importance_threshold": -0.3460172558534711, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15772680938243866, "epoch": 4.88, "learning_rate": 1.6686519631720098e-07, "loss": 0.1889, "step": 13510, "task_loss": 0.24707984924316406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03965144822469335, "compression/movement_sparsity/importance_threshold": -0.34528473773718693, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17334116995334625, "epoch": 4.89, "learning_rate": 1.658201155335295e-07, "loss": 0.1797, "step": 13520, "task_loss": 0.5400213003158569 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03970514897324082, "compression/movement_sparsity/importance_threshold": -0.3445532541777765, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1472284197807312, "epoch": 4.89, "learning_rate": 1.6477802177246646e-07, "loss": 0.171, "step": 13530, "task_loss": 0.5143538117408752 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03975877382504656, "compression/movement_sparsity/importance_threshold": -0.3438228044441558, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17164795100688934, "epoch": 4.89, "learning_rate": 1.637389187655306e-07, "loss": 0.1743, "step": 13540, "task_loss": 0.505851149559021 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039812322833782, "compression/movement_sparsity/importance_threshold": -0.343093387805241, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17246219515800476, "epoch": 4.9, "learning_rate": 1.627028102335305e-07, "loss": 0.1844, "step": 13550, "task_loss": 0.43365949392318726 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039865796053118525, "compression/movement_sparsity/importance_threshold": -0.34236500352994836, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17335930466651917, "epoch": 4.9, "learning_rate": 1.616696998865531e-07, "loss": 0.1656, "step": 13560, "task_loss": 0.40370243787765503 }, { "compression/movement_sparsity/importance_regularization_factor": 0.039919193536727585, "compression/movement_sparsity/importance_threshold": -0.34163765088719333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16594436764717102, "epoch": 4.9, "learning_rate": 1.60639591423949e-07, "loss": 0.1728, "step": 13570, "task_loss": 0.3566591441631317 }, { "compression/movement_sparsity/importance_regularization_factor": 0.03997251533828056, "compression/movement_sparsity/importance_threshold": -0.3409113291458926, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1836821734905243, "epoch": 4.91, "learning_rate": 1.596124885343203e-07, "loss": 0.1792, "step": 13580, "task_loss": 0.3985489308834076 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04002576151144888, "compression/movement_sparsity/importance_threshold": -0.3401860375749618, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22170782089233398, "epoch": 4.91, "learning_rate": 1.5858839489550546e-07, "loss": 0.1794, "step": 13590, "task_loss": 0.5840495824813843 }, { "compression/movement_sparsity/importance_regularization_factor": 0.040078932109903945, "compression/movement_sparsity/importance_threshold": -0.33946177544331735, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18641307950019836, "epoch": 4.92, "learning_rate": 1.575673141745689e-07, "loss": 0.176, "step": 13600, "task_loss": 0.5444910526275635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04013202718731717, "compression/movement_sparsity/importance_threshold": -0.33873854201987497, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17240336537361145, "epoch": 4.92, "learning_rate": 1.5654925002778574e-07, "loss": 0.1887, "step": 13610, "task_loss": 0.4279804825782776 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04018504679735995, "compression/movement_sparsity/importance_threshold": -0.3380163365735511, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16231310367584229, "epoch": 4.92, "learning_rate": 1.5553420610062905e-07, "loss": 0.1801, "step": 13620, "task_loss": 0.3823118805885315 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04023799099370372, "compression/movement_sparsity/importance_threshold": -0.3372951583732614, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16496533155441284, "epoch": 4.93, "learning_rate": 1.54522186027758e-07, "loss": 0.1814, "step": 13630, "task_loss": 0.3801088035106659 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04029085983001987, "compression/movement_sparsity/importance_threshold": -0.3365750066879223, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1552436500787735, "epoch": 4.93, "learning_rate": 1.5351319343300294e-07, "loss": 0.1735, "step": 13640, "task_loss": 0.8039931058883667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.040343653359979824, "compression/movement_sparsity/importance_threshold": -0.3358558807864498, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1522938311100006, "epoch": 4.93, "learning_rate": 1.5250723192935433e-07, "loss": 0.1814, "step": 13650, "task_loss": 0.6675360798835754 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04039637163725498, "compression/movement_sparsity/importance_threshold": -0.3351377799377597, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17480003833770752, "epoch": 4.94, "learning_rate": 1.5150430511894862e-07, "loss": 0.1833, "step": 13660, "task_loss": 0.4187951683998108 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04044901471551676, "compression/movement_sparsity/importance_threshold": -0.33442070341076835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13274918496608734, "epoch": 4.94, "learning_rate": 1.5050441659305558e-07, "loss": 0.1728, "step": 13670, "task_loss": 0.46560001373291016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.040501582648436577, "compression/movement_sparsity/importance_threshold": -0.3337046504743917, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18608583509922028, "epoch": 4.94, "learning_rate": 1.495075699320658e-07, "loss": 0.1855, "step": 13680, "task_loss": 0.5671523213386536 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04055407548968582, "compression/movement_sparsity/importance_threshold": -0.332989620397546, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19633609056472778, "epoch": 4.95, "learning_rate": 1.4851376870547705e-07, "loss": 0.1814, "step": 13690, "task_loss": 0.31386256217956543 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04060649329293593, "compression/movement_sparsity/importance_threshold": -0.33227561244914694, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17011404037475586, "epoch": 4.95, "learning_rate": 1.475230164718827e-07, "loss": 0.1851, "step": 13700, "task_loss": 0.3444925546646118 }, { "compression/movement_sparsity/importance_regularization_factor": 0.040658836111858304, "compression/movement_sparsity/importance_threshold": -0.3315626258981109, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17365840077400208, "epoch": 4.95, "learning_rate": 1.4653531677895748e-07, "loss": 0.1759, "step": 13710, "task_loss": 0.4338461756706238 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04071110400012435, "compression/movement_sparsity/importance_threshold": -0.3308506600133537, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16433964669704437, "epoch": 4.96, "learning_rate": 1.455506731634466e-07, "loss": 0.1771, "step": 13720, "task_loss": 0.1609778255224228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.040763297011405464, "compression/movement_sparsity/importance_threshold": -0.33013971406379183, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16652318835258484, "epoch": 4.96, "learning_rate": 1.445690891511515e-07, "loss": 0.1792, "step": 13730, "task_loss": 0.5048503279685974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04081541519937309, "compression/movement_sparsity/importance_threshold": -0.3294297873183408, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17034050822257996, "epoch": 4.97, "learning_rate": 1.4359056825691785e-07, "loss": 0.1854, "step": 13740, "task_loss": 0.6351929903030396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0408674586176986, "compression/movement_sparsity/importance_threshold": -0.3287208790459173, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14872363209724426, "epoch": 4.97, "learning_rate": 1.4261511398462333e-07, "loss": 0.1821, "step": 13750, "task_loss": 0.3965833783149719 }, { "epoch": 4.97, "eval_exact_match": 83.62346263008514, "eval_f1": 89.91764481115209, "step": 13750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04091942732005344, "compression/movement_sparsity/importance_threshold": -0.3280129885154368, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14365430176258087, "epoch": 4.97, "learning_rate": 1.4164272982716385e-07, "loss": 0.1755, "step": 13760, "task_loss": 0.46597981452941895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04097132136010901, "compression/movement_sparsity/importance_threshold": -0.32730611499581574, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13839992880821228, "epoch": 4.98, "learning_rate": 1.4067341926644283e-07, "loss": 0.172, "step": 13770, "task_loss": 0.33191508054733276 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04102314079153671, "compression/movement_sparsity/importance_threshold": -0.32660025775597, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14204658567905426, "epoch": 4.98, "learning_rate": 1.3970718577335728e-07, "loss": 0.1765, "step": 13780, "task_loss": 0.2808571457862854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04107488566800795, "compression/movement_sparsity/importance_threshold": -0.32589541606481587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1467570811510086, "epoch": 4.98, "learning_rate": 1.3874403280778602e-07, "loss": 0.1718, "step": 13790, "task_loss": 0.19776900112628937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041126556043194165, "compression/movement_sparsity/importance_threshold": -0.325191589191269, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16088761389255524, "epoch": 4.99, "learning_rate": 1.377839638185774e-07, "loss": 0.1789, "step": 13800, "task_loss": 0.6403120756149292 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04117815197076674, "compression/movement_sparsity/importance_threshold": -0.3244887764042459, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15906648337841034, "epoch": 4.99, "learning_rate": 1.3682698224353584e-07, "loss": 0.1813, "step": 13810, "task_loss": 0.4403289556503296 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041229673504397095, "compression/movement_sparsity/importance_threshold": -0.32378697697266245, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1257646381855011, "epoch": 4.99, "learning_rate": 1.3587309150941152e-07, "loss": 0.177, "step": 13820, "task_loss": 0.3140348196029663 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04128112069775663, "compression/movement_sparsity/importance_threshold": -0.32308619016543483, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21181027591228485, "epoch": 5.0, "learning_rate": 1.349222950318859e-07, "loss": 0.1915, "step": 13830, "task_loss": 0.4248882830142975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04133249360451678, "compression/movement_sparsity/importance_threshold": -0.3223864152514787, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1628672480583191, "epoch": 5.0, "learning_rate": 1.3397459621556128e-07, "loss": 0.1719, "step": 13840, "task_loss": 0.3244702219963074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04138379227834893, "compression/movement_sparsity/importance_threshold": -0.3216876514997108, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13734124600887299, "epoch": 5.01, "learning_rate": 1.3302999845394802e-07, "loss": 0.1865, "step": 13850, "task_loss": 0.36722129583358765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041435016772924514, "compression/movement_sparsity/importance_threshold": -0.32098989817904644, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1720079779624939, "epoch": 5.01, "learning_rate": 1.3208850512945135e-07, "loss": 0.1831, "step": 13860, "task_loss": 0.3153786063194275 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04148616714191491, "compression/movement_sparsity/importance_threshold": -0.32029315455840246, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1428808569908142, "epoch": 5.01, "learning_rate": 1.311501196133612e-07, "loss": 0.1775, "step": 13870, "task_loss": 0.6260837912559509 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04153724343899157, "compression/movement_sparsity/importance_threshold": -0.3195974199066942, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14632534980773926, "epoch": 5.02, "learning_rate": 1.3021484526583814e-07, "loss": 0.1824, "step": 13880, "task_loss": 0.2916293740272522 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04158824571782587, "compression/movement_sparsity/importance_threshold": -0.3189026934928383, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15432986617088318, "epoch": 5.02, "learning_rate": 1.2928268543590304e-07, "loss": 0.1779, "step": 13890, "task_loss": 0.4513210654258728 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041639174032089235, "compression/movement_sparsity/importance_threshold": -0.3182089745857505, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1750912070274353, "epoch": 5.02, "learning_rate": 1.2835364346142397e-07, "loss": 0.1778, "step": 13900, "task_loss": 0.5073824524879456 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04169002843545308, "compression/movement_sparsity/importance_threshold": -0.31751626245434694, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19097644090652466, "epoch": 5.03, "learning_rate": 1.2742772266910485e-07, "loss": 0.1807, "step": 13910, "task_loss": 0.4349386692047119 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04174080898158881, "compression/movement_sparsity/importance_threshold": -0.3168245563675438, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.185117706656456, "epoch": 5.03, "learning_rate": 1.265049263744734e-07, "loss": 0.1708, "step": 13920, "task_loss": 0.3032917380332947 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04179151572416783, "compression/movement_sparsity/importance_threshold": -0.3161338555942569, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1254778802394867, "epoch": 5.03, "learning_rate": 1.2558525788186834e-07, "loss": 0.1649, "step": 13930, "task_loss": 0.5277553200721741 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041842148716861566, "compression/movement_sparsity/importance_threshold": -0.31544415940340254, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1639249622821808, "epoch": 5.04, "learning_rate": 1.2466872048442935e-07, "loss": 0.1671, "step": 13940, "task_loss": 0.4619181752204895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04189270801334141, "compression/movement_sparsity/importance_threshold": -0.3147554670638968, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15116430819034576, "epoch": 5.04, "learning_rate": 1.237553174640842e-07, "loss": 0.1769, "step": 13950, "task_loss": 0.2680996358394623 }, { "compression/movement_sparsity/importance_regularization_factor": 0.041943193667278784, "compression/movement_sparsity/importance_threshold": -0.31406777784465545, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15021368861198425, "epoch": 5.05, "learning_rate": 1.228450520915364e-07, "loss": 0.1827, "step": 13960, "task_loss": 0.4265400767326355 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0419936057323451, "compression/movement_sparsity/importance_threshold": -0.3133810910145949, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1929803192615509, "epoch": 5.05, "learning_rate": 1.21937927626255e-07, "loss": 0.1773, "step": 13970, "task_loss": 0.5100224614143372 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04204394426221176, "compression/movement_sparsity/importance_threshold": -0.312695405842631, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15454885363578796, "epoch": 5.05, "learning_rate": 1.2103394731646143e-07, "loss": 0.1715, "step": 13980, "task_loss": 0.6407804489135742 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04209420931055017, "compression/movement_sparsity/importance_threshold": -0.3120107215976802, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17369097471237183, "epoch": 5.06, "learning_rate": 1.2013311439911954e-07, "loss": 0.1763, "step": 13990, "task_loss": 0.4219602346420288 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042144400931031764, "compression/movement_sparsity/importance_threshold": -0.31132703754865787, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1915154606103897, "epoch": 5.06, "learning_rate": 1.1923543209992183e-07, "loss": 0.1761, "step": 14000, "task_loss": 0.3387451767921448 }, { "epoch": 5.06, "eval_exact_match": 83.67076631977294, "eval_f1": 90.03655684421615, "step": 14000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04219451917732794, "compression/movement_sparsity/importance_threshold": -0.31064435296448056, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.153361976146698, "epoch": 5.06, "learning_rate": 1.1834090363327986e-07, "loss": 0.1855, "step": 14010, "task_loss": 0.576560378074646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042244564103110095, "compression/movement_sparsity/importance_threshold": -0.3099626671140644, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14965105056762695, "epoch": 5.07, "learning_rate": 1.174495322023118e-07, "loss": 0.1879, "step": 14020, "task_loss": 0.4468023478984833 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042294535762049654, "compression/movement_sparsity/importance_threshold": -0.3092819792663253, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15595000982284546, "epoch": 5.07, "learning_rate": 1.1656132099883131e-07, "loss": 0.1716, "step": 14030, "task_loss": 0.15840017795562744 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04234443420781801, "compression/movement_sparsity/importance_threshold": -0.30860228869017936, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.147873193025589, "epoch": 5.07, "learning_rate": 1.1567627320333594e-07, "loss": 0.1864, "step": 14040, "task_loss": 0.8370780348777771 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04239425949408662, "compression/movement_sparsity/importance_threshold": -0.30792359465454233, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15930218994617462, "epoch": 5.08, "learning_rate": 1.1479439198499519e-07, "loss": 0.179, "step": 14050, "task_loss": 0.4646834135055542 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04244401167452685, "compression/movement_sparsity/importance_threshold": -0.3072458964283309, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15202897787094116, "epoch": 5.08, "learning_rate": 1.1391568050164014e-07, "loss": 0.1753, "step": 14060, "task_loss": 0.4561488628387451 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04249369080281012, "compression/movement_sparsity/importance_threshold": -0.3065691932804606, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18747982382774353, "epoch": 5.08, "learning_rate": 1.1304014189975197e-07, "loss": 0.1802, "step": 14070, "task_loss": 0.33272579312324524 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042543296932607846, "compression/movement_sparsity/importance_threshold": -0.30589348447984777, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13845214247703552, "epoch": 5.09, "learning_rate": 1.1216777931444987e-07, "loss": 0.1822, "step": 14080, "task_loss": 0.27083098888397217 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042592830117591435, "compression/movement_sparsity/importance_threshold": -0.3052187692954085, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16344168782234192, "epoch": 5.09, "learning_rate": 1.1129859586948098e-07, "loss": 0.1797, "step": 14090, "task_loss": 0.4010601043701172 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04264229041143231, "compression/movement_sparsity/importance_threshold": -0.30454504699605844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15759651362895966, "epoch": 5.1, "learning_rate": 1.1043259467720778e-07, "loss": 0.1782, "step": 14100, "task_loss": 0.30105510354042053 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042691677867801855, "compression/movement_sparsity/importance_threshold": -0.30387231685071436, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17411063611507416, "epoch": 5.1, "learning_rate": 1.0956977883859886e-07, "loss": 0.1891, "step": 14110, "task_loss": 0.466052770614624 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0427409925403715, "compression/movement_sparsity/importance_threshold": -0.3032005781282918, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11521792411804199, "epoch": 5.1, "learning_rate": 1.0871015144321571e-07, "loss": 0.1681, "step": 14120, "task_loss": 0.5160799026489258 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042790234482812675, "compression/movement_sparsity/importance_threshold": -0.30252983009770684, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1370161473751068, "epoch": 5.11, "learning_rate": 1.078537155692032e-07, "loss": 0.1707, "step": 14130, "task_loss": 0.19232416152954102 }, { "compression/movement_sparsity/importance_regularization_factor": 0.042839403748796745, "compression/movement_sparsity/importance_threshold": -0.3018600720278759, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14609548449516296, "epoch": 5.11, "learning_rate": 1.0700047428327818e-07, "loss": 0.1588, "step": 14140, "task_loss": 0.476870059967041 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04288850039199515, "compression/movement_sparsity/importance_threshold": -0.3011913031877147, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13242539763450623, "epoch": 5.11, "learning_rate": 1.0615043064071783e-07, "loss": 0.1783, "step": 14150, "task_loss": 0.4068664014339447 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04293752446607929, "compression/movement_sparsity/importance_threshold": -0.3005235228461395, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15072208642959595, "epoch": 5.12, "learning_rate": 1.0530358768534997e-07, "loss": 0.1821, "step": 14160, "task_loss": 0.6981538534164429 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04298647602472058, "compression/movement_sparsity/importance_threshold": -0.29985673027206616, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17192161083221436, "epoch": 5.12, "learning_rate": 1.0445994844954064e-07, "loss": 0.1739, "step": 14170, "task_loss": 0.5850452184677124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043035355121590436, "compression/movement_sparsity/importance_threshold": -0.299190924734411, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1931232213973999, "epoch": 5.12, "learning_rate": 1.0361951595418439e-07, "loss": 0.1794, "step": 14180, "task_loss": 0.3741414248943329 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04308416181036027, "compression/movement_sparsity/importance_threshold": -0.29852610550209, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14067162573337555, "epoch": 5.13, "learning_rate": 1.0278229320869336e-07, "loss": 0.1783, "step": 14190, "task_loss": 0.3252699673175812 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04313289614470146, "compression/movement_sparsity/importance_threshold": -0.29786227184401926, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21310460567474365, "epoch": 5.13, "learning_rate": 1.0194828321098569e-07, "loss": 0.1935, "step": 14200, "task_loss": 0.5130503177642822 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043181558178285465, "compression/movement_sparsity/importance_threshold": -0.2971994230291146, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17947247624397278, "epoch": 5.14, "learning_rate": 1.0111748894747596e-07, "loss": 0.1843, "step": 14210, "task_loss": 0.36428236961364746 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04323014796478367, "compression/movement_sparsity/importance_threshold": -0.29653755832629225, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12573717534542084, "epoch": 5.14, "learning_rate": 1.0028991339306336e-07, "loss": 0.1788, "step": 14220, "task_loss": 0.6074889898300171 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04327866555786748, "compression/movement_sparsity/importance_threshold": -0.2958766770044685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14682647585868835, "epoch": 5.14, "learning_rate": 9.946555951112178e-08, "loss": 0.185, "step": 14230, "task_loss": 0.3594474196434021 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04332711101120833, "compression/movement_sparsity/importance_threshold": -0.2952167783325591, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11430035531520844, "epoch": 5.15, "learning_rate": 9.864443025348934e-08, "loss": 0.169, "step": 14240, "task_loss": 0.2298082858324051 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0433754843784776, "compression/movement_sparsity/importance_threshold": -0.2945578615794803, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1614457666873932, "epoch": 5.15, "learning_rate": 9.782652856045648e-08, "loss": 0.1708, "step": 14250, "task_loss": 0.7426341772079468 }, { "epoch": 5.15, "eval_exact_match": 83.74645222327341, "eval_f1": 90.08076277207161, "step": 14250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04342378571334672, "compression/movement_sparsity/importance_threshold": -0.29389992601414805, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1826505959033966, "epoch": 5.15, "learning_rate": 9.701185736075756e-08, "loss": 0.1722, "step": 14260, "task_loss": 0.6813669800758362 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043472015069487085, "compression/movement_sparsity/importance_threshold": -0.2932429709054786, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1705678105354309, "epoch": 5.16, "learning_rate": 9.620041957155834e-08, "loss": 0.1868, "step": 14270, "task_loss": 0.464776873588562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04352017250057013, "compression/movement_sparsity/importance_threshold": -0.29258699552238776, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16456995904445648, "epoch": 5.16, "learning_rate": 9.539221809844722e-08, "loss": 0.1777, "step": 14280, "task_loss": 0.4324108362197876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04356825806026725, "compression/movement_sparsity/importance_threshold": -0.29193199913379164, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17111265659332275, "epoch": 5.16, "learning_rate": 9.458725583542315e-08, "loss": 0.1817, "step": 14290, "task_loss": 0.4643493592739105 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043616271802249855, "compression/movement_sparsity/importance_threshold": -0.29127798100860647, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1873432695865631, "epoch": 5.17, "learning_rate": 9.378553566488668e-08, "loss": 0.1764, "step": 14300, "task_loss": 0.4464597702026367 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04366421378018936, "compression/movement_sparsity/importance_threshold": -0.29062494041574827, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1443527191877365, "epoch": 5.17, "learning_rate": 9.298706045762927e-08, "loss": 0.1753, "step": 14310, "task_loss": 0.3243370056152344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04371208404775718, "compression/movement_sparsity/importance_threshold": -0.28997287662413296, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13130953907966614, "epoch": 5.18, "learning_rate": 9.219183307282219e-08, "loss": 0.1697, "step": 14320, "task_loss": 0.5415463447570801 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04375988265862469, "compression/movement_sparsity/importance_threshold": -0.2893217889026771, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2080850899219513, "epoch": 5.18, "learning_rate": 9.139985635800784e-08, "loss": 0.1908, "step": 14330, "task_loss": 0.3747154474258423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04380760966646336, "compression/movement_sparsity/importance_threshold": -0.288671676520296, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14363014698028564, "epoch": 5.18, "learning_rate": 9.061113314908764e-08, "loss": 0.1722, "step": 14340, "task_loss": 0.6961146593093872 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04385526512494456, "compression/movement_sparsity/importance_threshold": -0.2880225387459062, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16117852926254272, "epoch": 5.19, "learning_rate": 8.982566627031363e-08, "loss": 0.1782, "step": 14350, "task_loss": 0.3901559114456177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043902849087739705, "compression/movement_sparsity/importance_threshold": -0.28737437484842376, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1831967532634735, "epoch": 5.19, "learning_rate": 8.904345853427753e-08, "loss": 0.1756, "step": 14360, "task_loss": 0.4879865348339081 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04395036160852022, "compression/movement_sparsity/importance_threshold": -0.28672718409676456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13761597871780396, "epoch": 5.19, "learning_rate": 8.826451274190039e-08, "loss": 0.1651, "step": 14370, "task_loss": 0.3214653432369232 }, { "compression/movement_sparsity/importance_regularization_factor": 0.043997802740957506, "compression/movement_sparsity/importance_threshold": -0.28608096575984476, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2639523446559906, "epoch": 5.2, "learning_rate": 8.748883168242327e-08, "loss": 0.1827, "step": 14380, "task_loss": 0.5773433446884155 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04404517253872297, "compression/movement_sparsity/importance_threshold": -0.2854357191065805, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12245256453752518, "epoch": 5.2, "learning_rate": 8.671641813339681e-08, "loss": 0.18, "step": 14390, "task_loss": 0.33123326301574707 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04409247105548803, "compression/movement_sparsity/importance_threshold": -0.2847914434058878, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17285960912704468, "epoch": 5.2, "learning_rate": 8.594727486067155e-08, "loss": 0.1814, "step": 14400, "task_loss": 0.43380165100097656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04413969834492409, "compression/movement_sparsity/importance_threshold": -0.2841481379266828, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18825030326843262, "epoch": 5.21, "learning_rate": 8.518140461838729e-08, "loss": 0.1752, "step": 14410, "task_loss": 0.38604140281677246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044186854460702585, "compression/movement_sparsity/importance_threshold": -0.2835058019378811, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1609213650226593, "epoch": 5.21, "learning_rate": 8.441881014896434e-08, "loss": 0.1699, "step": 14420, "task_loss": 0.5970258116722107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044233939456494877, "compression/movement_sparsity/importance_threshold": -0.2828644347083996, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14135953783988953, "epoch": 5.22, "learning_rate": 8.365949418309327e-08, "loss": 0.1687, "step": 14430, "task_loss": 0.2799544334411621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04428095338597242, "compression/movement_sparsity/importance_threshold": -0.2822240355071536, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15112349390983582, "epoch": 5.22, "learning_rate": 8.290345943972433e-08, "loss": 0.1782, "step": 14440, "task_loss": 0.41910994052886963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044327896302806605, "compression/movement_sparsity/importance_threshold": -0.2815846036030596, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19787323474884033, "epoch": 5.22, "learning_rate": 8.215070862605922e-08, "loss": 0.1737, "step": 14450, "task_loss": 0.4692561626434326 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04437476826066885, "compression/movement_sparsity/importance_threshold": -0.28094613826503345, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18470437824726105, "epoch": 5.23, "learning_rate": 8.140124443753982e-08, "loss": 0.1712, "step": 14460, "task_loss": 0.45406365394592285 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04442156931323056, "compression/movement_sparsity/importance_threshold": -0.2803086387619913, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13995935022830963, "epoch": 5.23, "learning_rate": 8.065506955783985e-08, "loss": 0.1751, "step": 14470, "task_loss": 0.5951769948005676 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04446829951416315, "compression/movement_sparsity/importance_threshold": -0.2796721043628493, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21652866899967194, "epoch": 5.23, "learning_rate": 7.991218665885458e-08, "loss": 0.1869, "step": 14480, "task_loss": 0.4158879518508911 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044514958917138026, "compression/movement_sparsity/importance_threshold": -0.27903653433652353, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16511324048042297, "epoch": 5.24, "learning_rate": 7.917259840069112e-08, "loss": 0.1711, "step": 14490, "task_loss": 0.42348212003707886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04456154757582661, "compression/movement_sparsity/importance_threshold": -0.27840192795192975, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18495193123817444, "epoch": 5.24, "learning_rate": 7.843630743165952e-08, "loss": 0.1812, "step": 14500, "task_loss": 0.6287417411804199 }, { "epoch": 5.24, "eval_exact_match": 83.6329233680227, "eval_f1": 90.02631898119101, "step": 14500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044608065543900295, "compression/movement_sparsity/importance_threshold": -0.2777682844779844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.179265558719635, "epoch": 5.24, "learning_rate": 7.770331638826266e-08, "loss": 0.1807, "step": 14510, "task_loss": 0.5263575315475464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04465451287503051, "compression/movement_sparsity/importance_threshold": -0.2771356031836033, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1451658308506012, "epoch": 5.25, "learning_rate": 7.697362789518757e-08, "loss": 0.1739, "step": 14520, "task_loss": 0.36418962478637695 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044700889622888644, "compression/movement_sparsity/importance_threshold": -0.2765038833377026, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1474297046661377, "epoch": 5.25, "learning_rate": 7.624724456529475e-08, "loss": 0.1855, "step": 14530, "task_loss": 0.3640234172344208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04474719584114613, "compression/movement_sparsity/importance_threshold": -0.2758731242091984, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13829626142978668, "epoch": 5.25, "learning_rate": 7.552416899961011e-08, "loss": 0.1974, "step": 14540, "task_loss": 0.28583553433418274 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04479343158347436, "compression/movement_sparsity/importance_threshold": -0.2752433250670069, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1722092181444168, "epoch": 5.26, "learning_rate": 7.48762312690956e-08, "loss": 0.1966, "step": 14550, "task_loss": 0.3825565278530121 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044839596903544755, "compression/movement_sparsity/importance_threshold": -0.27461448518004383, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1537836194038391, "epoch": 5.26, "learning_rate": 7.415944757880465e-08, "loss": 0.1829, "step": 14560, "task_loss": 0.42369458079338074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04488569185502873, "compression/movement_sparsity/importance_threshold": -0.2739866038172255, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15801842510700226, "epoch": 5.27, "learning_rate": 7.344597912868367e-08, "loss": 0.1636, "step": 14570, "task_loss": 0.5285253524780273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044931716491597676, "compression/movement_sparsity/importance_threshold": -0.27335968024746793, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15832439064979553, "epoch": 5.27, "learning_rate": 7.273582847351289e-08, "loss": 0.1715, "step": 14580, "task_loss": 0.4416083097457886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.044977670866923024, "compression/movement_sparsity/importance_threshold": -0.27273371373968713, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18099182844161987, "epoch": 5.27, "learning_rate": 7.202899815619234e-08, "loss": 0.1769, "step": 14590, "task_loss": 0.6669715046882629 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045023555034676165, "compression/movement_sparsity/importance_threshold": -0.27210870356279937, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1620885580778122, "epoch": 5.28, "learning_rate": 7.132549070773286e-08, "loss": 0.1729, "step": 14600, "task_loss": 0.25877806544303894 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04506936904852855, "compression/movement_sparsity/importance_threshold": -0.2714846489857202, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14744767546653748, "epoch": 5.28, "learning_rate": 7.062530864724625e-08, "loss": 0.1746, "step": 14610, "task_loss": 0.5256980061531067 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04511511296215154, "compression/movement_sparsity/importance_threshold": -0.2708615492773664, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18050046265125275, "epoch": 5.28, "learning_rate": 6.99284544819373e-08, "loss": 0.1755, "step": 14620, "task_loss": 0.4965248703956604 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04516078682921657, "compression/movement_sparsity/importance_threshold": -0.27023940370665356, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15234878659248352, "epoch": 5.29, "learning_rate": 6.923493070709397e-08, "loss": 0.1781, "step": 14630, "task_loss": 0.3228938579559326 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04520639070339504, "compression/movement_sparsity/importance_threshold": -0.2696182115424979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16729338467121124, "epoch": 5.29, "learning_rate": 6.85447398060791e-08, "loss": 0.1709, "step": 14640, "task_loss": 0.8934930562973022 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045251924638358386, "compression/movement_sparsity/importance_threshold": -0.26899797205381526, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15839503705501556, "epoch": 5.29, "learning_rate": 6.785788425032124e-08, "loss": 0.1794, "step": 14650, "task_loss": 0.33643460273742676 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04529738868777799, "compression/movement_sparsity/importance_threshold": -0.2683786845095222, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19412821531295776, "epoch": 5.3, "learning_rate": 6.717436649930508e-08, "loss": 0.1924, "step": 14660, "task_loss": 0.9410861134529114 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04534278290532528, "compression/movement_sparsity/importance_threshold": -0.2677603481785342, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12971442937850952, "epoch": 5.3, "learning_rate": 6.649418900056425e-08, "loss": 0.1753, "step": 14670, "task_loss": 0.26334819197654724 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045388107344671656, "compression/movement_sparsity/importance_threshold": -0.26714296232976775, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1588962972164154, "epoch": 5.31, "learning_rate": 6.581735418967094e-08, "loss": 0.1962, "step": 14680, "task_loss": 0.35144782066345215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04543336205948853, "compression/movement_sparsity/importance_threshold": -0.2665265262321388, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15858691930770874, "epoch": 5.31, "learning_rate": 6.514386449022846e-08, "loss": 0.1875, "step": 14690, "task_loss": 0.45107176899909973 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04547854710344731, "compression/movement_sparsity/importance_threshold": -0.2659110391545635, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15156474709510803, "epoch": 5.31, "learning_rate": 6.447372231386138e-08, "loss": 0.1698, "step": 14700, "task_loss": 0.41857224702835083 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04552366253021943, "compression/movement_sparsity/importance_threshold": -0.26529650036595753, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1653667539358139, "epoch": 5.32, "learning_rate": 6.380693006020788e-08, "loss": 0.1625, "step": 14710, "task_loss": 0.32113200426101685 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04556870839347626, "compression/movement_sparsity/importance_threshold": -0.2646829091352375, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1332647204399109, "epoch": 5.32, "learning_rate": 6.3143490116911e-08, "loss": 0.164, "step": 14720, "task_loss": 0.2849538326263428 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045613684746889246, "compression/movement_sparsity/importance_threshold": -0.264070264731319, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1563533991575241, "epoch": 5.32, "learning_rate": 6.248340485960912e-08, "loss": 0.177, "step": 14730, "task_loss": 0.27913862466812134 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04565859164412978, "compression/movement_sparsity/importance_threshold": -0.2634585664231185, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15281318128108978, "epoch": 5.33, "learning_rate": 6.182667665192876e-08, "loss": 0.1773, "step": 14740, "task_loss": 0.2936190664768219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04570342913886928, "compression/movement_sparsity/importance_threshold": -0.2628478134795518, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15253296494483948, "epoch": 5.33, "learning_rate": 6.117330784547547e-08, "loss": 0.179, "step": 14750, "task_loss": 0.4092378616333008 }, { "epoch": 5.33, "eval_exact_match": 83.69914853358561, "eval_f1": 90.03206384226705, "step": 14750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04574819728477916, "compression/movement_sparsity/importance_threshold": -0.262238005169535, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13614681363105774, "epoch": 5.33, "learning_rate": 6.052330077982548e-08, "loss": 0.1718, "step": 14760, "task_loss": 0.23583835363388062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04579289613553082, "compression/movement_sparsity/importance_threshold": -0.2616291407619843, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17683929204940796, "epoch": 5.34, "learning_rate": 5.987665778251739e-08, "loss": 0.1807, "step": 14770, "task_loss": 0.3152199983596802 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04583752574479569, "compression/movement_sparsity/importance_threshold": -0.2610212195258156, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17758771777153015, "epoch": 5.34, "learning_rate": 5.9233381169043415e-08, "loss": 0.1767, "step": 14780, "task_loss": 0.6586979031562805 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045882086166245145, "compression/movement_sparsity/importance_threshold": -0.2604142407299451, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15100859105587006, "epoch": 5.35, "learning_rate": 5.8593473242842026e-08, "loss": 0.1851, "step": 14790, "task_loss": 0.5096355676651001 }, { "compression/movement_sparsity/importance_regularization_factor": 0.045926577453550624, "compression/movement_sparsity/importance_threshold": -0.2598082036432888, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15634649991989136, "epoch": 5.35, "learning_rate": 5.795693629528842e-08, "loss": 0.1731, "step": 14800, "task_loss": 0.28395843505859375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04597099966038353, "compression/movement_sparsity/importance_threshold": -0.25920310753476283, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17545104026794434, "epoch": 5.35, "learning_rate": 5.732377260568777e-08, "loss": 0.1671, "step": 14810, "task_loss": 0.3866184949874878 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04601535284041529, "compression/movement_sparsity/importance_threshold": -0.2585989516732832, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1806982159614563, "epoch": 5.36, "learning_rate": 5.669398444126605e-08, "loss": 0.176, "step": 14820, "task_loss": 0.5448100566864014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046059637047317276, "compression/movement_sparsity/importance_threshold": -0.257995735327766, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13397540152072906, "epoch": 5.36, "learning_rate": 5.606757405716189e-08, "loss": 0.1662, "step": 14830, "task_loss": 0.18283666670322418 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046103852334760935, "compression/movement_sparsity/importance_threshold": -0.2573934577671272, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1665625274181366, "epoch": 5.36, "learning_rate": 5.544454369641927e-08, "loss": 0.178, "step": 14840, "task_loss": 0.37204509973526 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04614799875641766, "compression/movement_sparsity/importance_threshold": -0.25679211826028303, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15637090802192688, "epoch": 5.37, "learning_rate": 5.482489558997849e-08, "loss": 0.1857, "step": 14850, "task_loss": 0.6641359925270081 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04619207636595886, "compression/movement_sparsity/importance_threshold": -0.25619171607614943, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1537550389766693, "epoch": 5.37, "learning_rate": 5.420863195666925e-08, "loss": 0.1582, "step": 14860, "task_loss": 0.2819337844848633 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04623608521705596, "compression/movement_sparsity/importance_threshold": -0.25559225048364254, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1731978952884674, "epoch": 5.37, "learning_rate": 5.35957550032019e-08, "loss": 0.1711, "step": 14870, "task_loss": 0.3445536494255066 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04628002536338036, "compression/movement_sparsity/importance_threshold": -0.25499372075167837, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1405542939901352, "epoch": 5.38, "learning_rate": 5.298626692415975e-08, "loss": 0.1704, "step": 14880, "task_loss": 0.4534985423088074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04632389685860346, "compression/movement_sparsity/importance_threshold": -0.2543961261491732, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1388224959373474, "epoch": 5.38, "learning_rate": 5.238016990199157e-08, "loss": 0.1652, "step": 14890, "task_loss": 0.3235897123813629 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046367699756396714, "compression/movement_sparsity/importance_threshold": -0.25379946594504255, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1863054782152176, "epoch": 5.38, "learning_rate": 5.1777466107002844e-08, "loss": 0.1791, "step": 14900, "task_loss": 0.5134084224700928 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046411434110431464, "compression/movement_sparsity/importance_threshold": -0.2532037394082032, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16694492101669312, "epoch": 5.39, "learning_rate": 5.117815769734946e-08, "loss": 0.1711, "step": 14910, "task_loss": 0.45068734884262085 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04645509997437919, "compression/movement_sparsity/importance_threshold": -0.25260894580757065, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1374826282262802, "epoch": 5.39, "learning_rate": 5.058224681902834e-08, "loss": 0.1673, "step": 14920, "task_loss": 0.1980714350938797 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046498697401911246, "compression/movement_sparsity/importance_threshold": -0.25201508441206133, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1443094164133072, "epoch": 5.4, "learning_rate": 4.998973560587105e-08, "loss": 0.1678, "step": 14930, "task_loss": 0.2809217870235443 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04654222644669908, "compression/movement_sparsity/importance_threshold": -0.251422154490591, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1853853315114975, "epoch": 5.4, "learning_rate": 4.940062617953567e-08, "loss": 0.1701, "step": 14940, "task_loss": 0.3242225646972656 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04658568716241408, "compression/movement_sparsity/importance_threshold": -0.2508301553120761, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16819274425506592, "epoch": 5.4, "learning_rate": 4.881492064949888e-08, "loss": 0.1736, "step": 14950, "task_loss": 0.5310601592063904 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04662907960272768, "compression/movement_sparsity/importance_threshold": -0.25023908614543233, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15723347663879395, "epoch": 5.41, "learning_rate": 4.823262111304904e-08, "loss": 0.1747, "step": 14960, "task_loss": 0.5195972919464111 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04667240382131126, "compression/movement_sparsity/importance_threshold": -0.249648946259576, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1590065211057663, "epoch": 5.41, "learning_rate": 4.7653729655278254e-08, "loss": 0.1759, "step": 14970, "task_loss": 0.4165058135986328 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04671565987183626, "compression/movement_sparsity/importance_threshold": -0.24905973492342304, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17201870679855347, "epoch": 5.41, "learning_rate": 4.707824834907481e-08, "loss": 0.1829, "step": 14980, "task_loss": 0.3914673924446106 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046758847807974076, "compression/movement_sparsity/importance_threshold": -0.24847145140588967, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18516141176223755, "epoch": 5.42, "learning_rate": 4.650617925511635e-08, "loss": 0.1768, "step": 14990, "task_loss": 0.47868090867996216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04680196768339613, "compression/movement_sparsity/importance_threshold": -0.2478840949758917, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13710492849349976, "epoch": 5.42, "learning_rate": 4.5937524421861826e-08, "loss": 0.1677, "step": 15000, "task_loss": 0.3779285252094269 }, { "epoch": 5.42, "eval_exact_match": 83.49101229895932, "eval_f1": 89.94383123761723, "step": 15000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04684501955177381, "compression/movement_sparsity/importance_threshold": -0.2472976649023455, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17632843554019928, "epoch": 5.42, "learning_rate": 4.537228588554476e-08, "loss": 0.1732, "step": 15010, "task_loss": 0.6332323551177979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.046888003466778544, "compression/movement_sparsity/importance_threshold": -0.24671216045416688, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16848604381084442, "epoch": 5.43, "learning_rate": 4.4810465670164886e-08, "loss": 0.1745, "step": 15020, "task_loss": 0.586732029914856 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04693091948208174, "compression/movement_sparsity/importance_threshold": -0.24612758090027198, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16656005382537842, "epoch": 5.43, "learning_rate": 4.425206578748275e-08, "loss": 0.1607, "step": 15030, "task_loss": 0.4396723508834839 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0469737676513548, "compression/movement_sparsity/importance_threshold": -0.24554392550957704, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1422872245311737, "epoch": 5.44, "learning_rate": 4.369708823701024e-08, "loss": 0.1739, "step": 15040, "task_loss": 0.4501122534275055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047016548028269156, "compression/movement_sparsity/importance_threshold": -0.24496119355099788, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15386220812797546, "epoch": 5.44, "learning_rate": 4.31455350060056e-08, "loss": 0.1732, "step": 15050, "task_loss": 0.2968456447124481 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04705926066649619, "compression/movement_sparsity/importance_threshold": -0.24437938429345074, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18353886902332306, "epoch": 5.44, "learning_rate": 4.259740806946477e-08, "loss": 0.1725, "step": 15060, "task_loss": 0.4033396244049072 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047101905619707334, "compression/movement_sparsity/importance_threshold": -0.24379849700585166, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1291903555393219, "epoch": 5.45, "learning_rate": 4.205270939011474e-08, "loss": 0.1536, "step": 15070, "task_loss": 0.38127729296684265 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047144482941573994, "compression/movement_sparsity/importance_threshold": -0.24321853095711654, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1640148162841797, "epoch": 5.45, "learning_rate": 4.151144091840708e-08, "loss": 0.1776, "step": 15080, "task_loss": 0.6958640813827515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04718699268576758, "compression/movement_sparsity/importance_threshold": -0.24263948541616154, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17455732822418213, "epoch": 5.45, "learning_rate": 4.0973604592510094e-08, "loss": 0.184, "step": 15090, "task_loss": 0.2959747612476349 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047229434905959496, "compression/movement_sparsity/importance_threshold": -0.2420613596519029, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13450655341148376, "epoch": 5.46, "learning_rate": 4.043920233830267e-08, "loss": 0.1819, "step": 15100, "task_loss": 0.2962379455566406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04727180965582116, "compression/movement_sparsity/importance_threshold": -0.24148415293325654, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15026503801345825, "epoch": 5.46, "learning_rate": 3.990823606936666e-08, "loss": 0.1767, "step": 15110, "task_loss": 0.7710201740264893 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04731411698902398, "compression/movement_sparsity/importance_threshold": -0.24090786452913848, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1360689401626587, "epoch": 5.46, "learning_rate": 3.938070768698054e-08, "loss": 0.1722, "step": 15120, "task_loss": 0.46412721276283264 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047356356959239364, "compression/movement_sparsity/importance_threshold": -0.24033249370846488, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1195104569196701, "epoch": 5.47, "learning_rate": 3.885661908011273e-08, "loss": 0.1808, "step": 15130, "task_loss": 0.4792066216468811 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04739852962013873, "compression/movement_sparsity/importance_threshold": -0.23975803974015175, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1432269811630249, "epoch": 5.47, "learning_rate": 3.833597212541373e-08, "loss": 0.1799, "step": 15140, "task_loss": 0.41720786690711975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047440635025393474, "compression/movement_sparsity/importance_threshold": -0.23918450189311524, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1752137839794159, "epoch": 5.48, "learning_rate": 3.781876868721112e-08, "loss": 0.1808, "step": 15150, "task_loss": 0.42168617248535156 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04748267322867502, "compression/movement_sparsity/importance_threshold": -0.23861187943627127, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15821881592273712, "epoch": 5.48, "learning_rate": 3.7305010617501245e-08, "loss": 0.1747, "step": 15160, "task_loss": 0.25698161125183105 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04752464428365478, "compression/movement_sparsity/importance_threshold": -0.23804017163853597, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.166069895029068, "epoch": 5.48, "learning_rate": 3.679469975594385e-08, "loss": 0.1685, "step": 15170, "task_loss": 0.4625723361968994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04756654824400414, "compression/movement_sparsity/importance_threshold": -0.2374693777688256, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17632624506950378, "epoch": 5.49, "learning_rate": 3.6287837929854795e-08, "loss": 0.1778, "step": 15180, "task_loss": 0.36426687240600586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047608385163394565, "compression/movement_sparsity/importance_threshold": -0.23689949709605584, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16632002592086792, "epoch": 5.49, "learning_rate": 3.578442695419925e-08, "loss": 0.1632, "step": 15190, "task_loss": 0.4585450291633606 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0476501550954974, "compression/movement_sparsity/importance_threshold": -0.2363305288891432, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20258350670337677, "epoch": 5.49, "learning_rate": 3.528446863158641e-08, "loss": 0.1867, "step": 15200, "task_loss": 0.41815778613090515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047691858093984116, "compression/movement_sparsity/importance_threshold": -0.2357624724170032, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19658106565475464, "epoch": 5.5, "learning_rate": 3.4787964752261536e-08, "loss": 0.1876, "step": 15210, "task_loss": 0.5894261002540588 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04773349421252606, "compression/movement_sparsity/importance_threshold": -0.2351953269485526, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1922135055065155, "epoch": 5.5, "learning_rate": 3.4294917094100484e-08, "loss": 0.1867, "step": 15220, "task_loss": 0.4709359407424927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04777506350479469, "compression/movement_sparsity/importance_threshold": -0.23462909175270685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14158789813518524, "epoch": 5.5, "learning_rate": 3.380532742260334e-08, "loss": 0.1907, "step": 15230, "task_loss": 0.7388123869895935 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04781656602446141, "compression/movement_sparsity/importance_threshold": -0.23406376609838242, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21245448291301727, "epoch": 5.51, "learning_rate": 3.331919749088763e-08, "loss": 0.1904, "step": 15240, "task_loss": 0.6762509346008301 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047858001825197616, "compression/movement_sparsity/importance_threshold": -0.23349934925449511, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16487757861614227, "epoch": 5.51, "learning_rate": 3.283652903968237e-08, "loss": 0.1809, "step": 15250, "task_loss": 0.6355116367340088 }, { "epoch": 5.51, "eval_exact_match": 83.60454115421003, "eval_f1": 90.05225316670094, "step": 15250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04789937096067474, "compression/movement_sparsity/importance_threshold": -0.23293584048996108, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13231196999549866, "epoch": 5.51, "learning_rate": 3.235732379732148e-08, "loss": 0.1582, "step": 15260, "task_loss": 0.23886415362358093 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04794067348456416, "compression/movement_sparsity/importance_threshold": -0.23237323907369634, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17306077480316162, "epoch": 5.52, "learning_rate": 3.188158347973846e-08, "loss": 0.1806, "step": 15270, "task_loss": 0.5274963974952698 }, { "compression/movement_sparsity/importance_regularization_factor": 0.047981909450537305, "compression/movement_sparsity/importance_threshold": -0.23181154427461725, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1836722195148468, "epoch": 5.52, "learning_rate": 3.140930979045886e-08, "loss": 0.1796, "step": 15280, "task_loss": 0.443234384059906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0480230789122656, "compression/movement_sparsity/importance_threshold": -0.23125075536163953, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14876201748847961, "epoch": 5.53, "learning_rate": 3.094050442059559e-08, "loss": 0.1717, "step": 15290, "task_loss": 0.7542725801467896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04806418192342043, "compression/movement_sparsity/importance_threshold": -0.23069087160367951, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16534817218780518, "epoch": 5.53, "learning_rate": 3.047516904884206e-08, "loss": 0.1689, "step": 15300, "task_loss": 0.4024174213409424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048105218537673224, "compression/movement_sparsity/importance_threshold": -0.23013189226965303, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19905927777290344, "epoch": 5.53, "learning_rate": 3.0013305341466066e-08, "loss": 0.1737, "step": 15310, "task_loss": 0.5906176567077637 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048146188808695395, "compression/movement_sparsity/importance_threshold": -0.22957381662847598, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.174982950091362, "epoch": 5.54, "learning_rate": 2.9554914952304665e-08, "loss": 0.1761, "step": 15320, "task_loss": 0.45582154393196106 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04818709279015833, "compression/movement_sparsity/importance_threshold": -0.22901664394906496, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17691166698932648, "epoch": 5.54, "learning_rate": 2.9099999522757103e-08, "loss": 0.1961, "step": 15330, "task_loss": 0.5763674974441528 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04822793053573346, "compression/movement_sparsity/importance_threshold": -0.22846037350033566, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19818875193595886, "epoch": 5.54, "learning_rate": 2.86485606817799e-08, "loss": 0.1801, "step": 15340, "task_loss": 0.6044281721115112 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04826870209909218, "compression/movement_sparsity/importance_threshold": -0.22790500455120444, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13256880640983582, "epoch": 5.55, "learning_rate": 2.820060004588054e-08, "loss": 0.1709, "step": 15350, "task_loss": 0.4946630001068115 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048309407533905925, "compression/movement_sparsity/importance_threshold": -0.227350536370587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.177134171128273, "epoch": 5.55, "learning_rate": 2.7756119219111805e-08, "loss": 0.1756, "step": 15360, "task_loss": 0.6153013706207275 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04835004689384608, "compression/movement_sparsity/importance_threshold": -0.2267969682273997, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15522846579551697, "epoch": 5.55, "learning_rate": 2.7315119793065998e-08, "loss": 0.1707, "step": 15370, "task_loss": 0.27729225158691406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04839062023258407, "compression/movement_sparsity/importance_threshold": -0.22624429939055846, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16957354545593262, "epoch": 5.56, "learning_rate": 2.687760334686917e-08, "loss": 0.1731, "step": 15380, "task_loss": 0.6803514361381531 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048431127603791306, "compression/movement_sparsity/importance_threshold": -0.2256925291289793, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14271289110183716, "epoch": 5.56, "learning_rate": 2.6443571447175795e-08, "loss": 0.1669, "step": 15390, "task_loss": 0.5242533087730408 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04847156906113919, "compression/movement_sparsity/importance_threshold": -0.2251416567115785, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1309148520231247, "epoch": 5.57, "learning_rate": 2.6013025648162546e-08, "loss": 0.1711, "step": 15400, "task_loss": 0.47524943947792053 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04851194465829914, "compression/movement_sparsity/importance_threshold": -0.22459168140727181, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14253975450992584, "epoch": 5.57, "learning_rate": 2.558596749152342e-08, "loss": 0.1968, "step": 15410, "task_loss": 0.4356352686882019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04855225444894257, "compression/movement_sparsity/importance_threshold": -0.22404260248497565, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17380890250205994, "epoch": 5.57, "learning_rate": 2.5162398506463957e-08, "loss": 0.1748, "step": 15420, "task_loss": 0.48908424377441406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04859249848674087, "compression/movement_sparsity/importance_threshold": -0.2234944192136059, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1803099513053894, "epoch": 5.58, "learning_rate": 2.4742320209695245e-08, "loss": 0.1778, "step": 15430, "task_loss": 0.35538819432258606 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04863267682536546, "compression/movement_sparsity/importance_threshold": -0.22294713086207874, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14477093517780304, "epoch": 5.58, "learning_rate": 2.4325734105429486e-08, "loss": 0.1798, "step": 15440, "task_loss": 0.3041430115699768 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048672789518487775, "compression/movement_sparsity/importance_threshold": -0.22240073669930993, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16156978905200958, "epoch": 5.58, "learning_rate": 2.391264168537377e-08, "loss": 0.19, "step": 15450, "task_loss": 0.4806953966617584 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048712836619779196, "compression/movement_sparsity/importance_threshold": -0.22185523599421586, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20278826355934143, "epoch": 5.59, "learning_rate": 2.350304442872497e-08, "loss": 0.1881, "step": 15460, "task_loss": 0.7880595326423645 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048752818182911134, "compression/movement_sparsity/importance_threshold": -0.22131062801571266, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15064328908920288, "epoch": 5.59, "learning_rate": 2.309694380216487e-08, "loss": 0.1644, "step": 15470, "task_loss": 0.37797996401786804 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04879273426155503, "compression/movement_sparsity/importance_threshold": -0.2207669120327158, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15130218863487244, "epoch": 5.59, "learning_rate": 2.2694341259854366e-08, "loss": 0.1658, "step": 15480, "task_loss": 0.5249390602111816 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048832584909382255, "compression/movement_sparsity/importance_threshold": -0.22022408731414223, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15106838941574097, "epoch": 5.6, "learning_rate": 2.2295238243428384e-08, "loss": 0.1679, "step": 15490, "task_loss": 0.43280619382858276 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048872370180064253, "compression/movement_sparsity/importance_threshold": -0.21968215312890738, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.161295086145401, "epoch": 5.6, "learning_rate": 2.1899636181990644e-08, "loss": 0.1777, "step": 15500, "task_loss": 0.22472208738327026 }, { "epoch": 5.6, "eval_exact_match": 83.52885525070955, "eval_f1": 89.91861288595577, "step": 15500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04891209012727241, "compression/movement_sparsity/importance_threshold": -0.2191411087459274, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15865418314933777, "epoch": 5.61, "learning_rate": 2.1507536492109123e-08, "loss": 0.1714, "step": 15510, "task_loss": 0.3335818648338318 }, { "compression/movement_sparsity/importance_regularization_factor": 0.048951744804678156, "compression/movement_sparsity/importance_threshold": -0.21860095343411856, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16958612203598022, "epoch": 5.61, "learning_rate": 2.1118940577810274e-08, "loss": 0.1739, "step": 15520, "task_loss": 0.4664156436920166 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04899133426595289, "compression/movement_sparsity/importance_threshold": -0.21806168646239676, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1692367047071457, "epoch": 5.61, "learning_rate": 2.0733849830574135e-08, "loss": 0.1765, "step": 15530, "task_loss": 0.6259729862213135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04903085856476802, "compression/movement_sparsity/importance_threshold": -0.21752330709967815, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1735900342464447, "epoch": 5.62, "learning_rate": 2.0352265629329678e-08, "loss": 0.192, "step": 15540, "task_loss": 0.343353807926178 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049070317754794965, "compression/movement_sparsity/importance_threshold": -0.21698581461487876, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13655024766921997, "epoch": 5.62, "learning_rate": 1.997418934044959e-08, "loss": 0.1741, "step": 15550, "task_loss": 0.16692593693733215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049109711889705136, "compression/movement_sparsity/importance_threshold": -0.2164492082769146, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14855730533599854, "epoch": 5.62, "learning_rate": 1.95996223177457e-08, "loss": 0.1727, "step": 15560, "task_loss": 0.46668440103530884 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04914904102316992, "compression/movement_sparsity/importance_threshold": -0.21591348735470217, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14958634972572327, "epoch": 5.63, "learning_rate": 1.9228565902463356e-08, "loss": 0.1707, "step": 15570, "task_loss": 0.32905149459838867 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04918830520886077, "compression/movement_sparsity/importance_threshold": -0.2153786511171567, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15305137634277344, "epoch": 5.63, "learning_rate": 1.8861021423277722e-08, "loss": 0.1729, "step": 15580, "task_loss": 0.45828777551651 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049227504500449056, "compression/movement_sparsity/importance_threshold": -0.21484469883319512, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1438327133655548, "epoch": 5.63, "learning_rate": 1.8496990196288143e-08, "loss": 0.1723, "step": 15590, "task_loss": 0.28205084800720215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04926663895160622, "compression/movement_sparsity/importance_threshold": -0.2143116297717329, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19014272093772888, "epoch": 5.64, "learning_rate": 1.8136473525013907e-08, "loss": 0.1907, "step": 15600, "task_loss": 0.6441313624382019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04930570861600365, "compression/movement_sparsity/importance_threshold": -0.2137794432016863, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1672857701778412, "epoch": 5.64, "learning_rate": 1.7779472700389265e-08, "loss": 0.1924, "step": 15610, "task_loss": 0.4105015695095062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04934471354731277, "compression/movement_sparsity/importance_threshold": -0.21324813839197143, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1488228738307953, "epoch": 5.65, "learning_rate": 1.742598900075909e-08, "loss": 0.1778, "step": 15620, "task_loss": 0.41871628165245056 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04938365379920498, "compression/movement_sparsity/importance_threshold": -0.21271771461150446, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19621488451957703, "epoch": 5.65, "learning_rate": 1.7076023691874e-08, "loss": 0.1715, "step": 15630, "task_loss": 0.36311233043670654 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04942252942535171, "compression/movement_sparsity/importance_threshold": -0.2121881711292012, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14452539384365082, "epoch": 5.65, "learning_rate": 1.6729578026886347e-08, "loss": 0.1759, "step": 15640, "task_loss": 0.3132791519165039 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049461340479424354, "compression/movement_sparsity/importance_threshold": -0.21165950721397775, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20849350094795227, "epoch": 5.66, "learning_rate": 1.6386653246344916e-08, "loss": 0.1801, "step": 15650, "task_loss": 0.4664916396141052 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04950008701509432, "compression/movement_sparsity/importance_threshold": -0.2111317221347504, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14356830716133118, "epoch": 5.66, "learning_rate": 1.6047250578191342e-08, "loss": 0.1875, "step": 15660, "task_loss": 0.33819547295570374 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049538769086033024, "compression/movement_sparsity/importance_threshold": -0.21060481516043505, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17026101052761078, "epoch": 5.66, "learning_rate": 1.571137123775501e-08, "loss": 0.198, "step": 15670, "task_loss": 0.28901436924934387 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04957738674591188, "compression/movement_sparsity/importance_threshold": -0.21007878555994786, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18996986746788025, "epoch": 5.67, "learning_rate": 1.5379016427749193e-08, "loss": 0.1702, "step": 15680, "task_loss": 0.3736341595649719 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0496159400484023, "compression/movement_sparsity/importance_threshold": -0.20955363260220472, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16425611078739166, "epoch": 5.67, "learning_rate": 1.5050187338266574e-08, "loss": 0.1707, "step": 15690, "task_loss": 0.32322365045547485 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04965442904717568, "compression/movement_sparsity/importance_threshold": -0.2090293555561219, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17800205945968628, "epoch": 5.67, "learning_rate": 1.4724885146774834e-08, "loss": 0.1811, "step": 15700, "task_loss": 0.4337901771068573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04969285379590344, "compression/movement_sparsity/importance_threshold": -0.20850595369061542, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15589317679405212, "epoch": 5.68, "learning_rate": 1.4403111018112645e-08, "loss": 0.1792, "step": 15710, "task_loss": 0.357605516910553 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04973121434825701, "compression/movement_sparsity/importance_threshold": -0.2079834262746012, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16989806294441223, "epoch": 5.68, "learning_rate": 1.408486610448567e-08, "loss": 0.1705, "step": 15720, "task_loss": 0.5997334718704224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04976951075790777, "compression/movement_sparsity/importance_threshold": -0.2074617725769956, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15032757818698883, "epoch": 5.68, "learning_rate": 1.3770151545461683e-08, "loss": 0.1719, "step": 15730, "task_loss": 0.4771418273448944 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04980774307852714, "compression/movement_sparsity/importance_threshold": -0.20694099186671433, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15643621981143951, "epoch": 5.69, "learning_rate": 1.3458968467967457e-08, "loss": 0.1763, "step": 15740, "task_loss": 0.41822129487991333 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049845911363786535, "compression/movement_sparsity/importance_threshold": -0.20642108341267373, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12617532908916473, "epoch": 5.69, "learning_rate": 1.3151317986283994e-08, "loss": 0.1741, "step": 15750, "task_loss": 0.5501689910888672 }, { "epoch": 5.69, "eval_exact_match": 83.65184484389782, "eval_f1": 89.99673268802798, "step": 15750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049884015667357354, "compression/movement_sparsity/importance_threshold": -0.20590204648378985, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14693979918956757, "epoch": 5.7, "learning_rate": 1.2847201202043079e-08, "loss": 0.1932, "step": 15760, "task_loss": 0.44467893242836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.049922056042911035, "compression/movement_sparsity/importance_threshold": -0.20538388034897848, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15097220242023468, "epoch": 5.7, "learning_rate": 1.254661920422273e-08, "loss": 0.1731, "step": 15770, "task_loss": 0.2182091921567917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04996003254411896, "compression/movement_sparsity/importance_threshold": -0.20486658427715598, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17072612047195435, "epoch": 5.7, "learning_rate": 1.2249573069143981e-08, "loss": 0.1689, "step": 15780, "task_loss": 0.35801267623901367 }, { "compression/movement_sparsity/importance_regularization_factor": 0.04999794522465255, "compression/movement_sparsity/importance_threshold": -0.2043501575372384, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13755616545677185, "epoch": 5.71, "learning_rate": 1.1956063860466436e-08, "loss": 0.173, "step": 15790, "task_loss": 0.3809490203857422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05003579413818322, "compression/movement_sparsity/importance_threshold": -0.20383459939814164, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17082442343235016, "epoch": 5.71, "learning_rate": 1.1666092629184831e-08, "loss": 0.1782, "step": 15800, "task_loss": 0.571792721748352 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05007357933838238, "compression/movement_sparsity/importance_threshold": -0.20331990912878184, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18530184030532837, "epoch": 5.71, "learning_rate": 1.1379660413625037e-08, "loss": 0.183, "step": 15810, "task_loss": 0.5109858512878418 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05011130087892143, "compression/movement_sparsity/importance_threshold": -0.20280608599807504, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14845629036426544, "epoch": 5.72, "learning_rate": 1.1096768239440612e-08, "loss": 0.1628, "step": 15820, "task_loss": 0.2701829671859741 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0501489588134718, "compression/movement_sparsity/importance_threshold": -0.20229312927493737, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16749542951583862, "epoch": 5.72, "learning_rate": 1.081741711960893e-08, "loss": 0.1836, "step": 15830, "task_loss": 0.4017007350921631 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05018655319570488, "compression/movement_sparsity/importance_threshold": -0.20178103822828497, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1643616259098053, "epoch": 5.72, "learning_rate": 1.0541608054427386e-08, "loss": 0.1929, "step": 15840, "task_loss": 0.4987179636955261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05022408407929209, "compression/movement_sparsity/importance_threshold": -0.20126981212703376, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16851231455802917, "epoch": 5.73, "learning_rate": 1.0269342031510531e-08, "loss": 0.1722, "step": 15850, "task_loss": 0.3005487620830536 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05026155151790483, "compression/movement_sparsity/importance_threshold": -0.2007594502400999, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17689800262451172, "epoch": 5.73, "learning_rate": 1.000062002578539e-08, "loss": 0.1869, "step": 15860, "task_loss": 0.1859908550977707 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05029895556521454, "compression/movement_sparsity/importance_threshold": -0.20024995183639926, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20160090923309326, "epoch": 5.74, "learning_rate": 9.73544299948903e-09, "loss": 0.1833, "step": 15870, "task_loss": 0.7696638107299805 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05033629627489259, "compression/movement_sparsity/importance_threshold": -0.19974131618484825, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17273235321044922, "epoch": 5.74, "learning_rate": 9.473811902164564e-09, "loss": 0.1757, "step": 15880, "task_loss": 0.5132439136505127 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05037357370061043, "compression/movement_sparsity/importance_threshold": -0.19923354255436265, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1815074384212494, "epoch": 5.74, "learning_rate": 9.215727670657813e-09, "loss": 0.1809, "step": 15890, "task_loss": 0.47850722074508667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05041078789603944, "compression/movement_sparsity/importance_threshold": -0.1987266302138586, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15570800006389618, "epoch": 5.75, "learning_rate": 8.961191229114317e-09, "loss": 0.1657, "step": 15900, "task_loss": 0.38322216272354126 }, { "compression/movement_sparsity/importance_regularization_factor": 0.050447938914851054, "compression/movement_sparsity/importance_threshold": -0.19822057843225227, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14382407069206238, "epoch": 5.75, "learning_rate": 8.710203488975221e-09, "loss": 0.1711, "step": 15910, "task_loss": 0.3041207194328308 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05048502681071666, "compression/movement_sparsity/importance_threshold": -0.19771538647845965, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10968972742557526, "epoch": 5.75, "learning_rate": 8.462765348974943e-09, "loss": 0.1669, "step": 15920, "task_loss": 0.47378888726234436 }, { "compression/movement_sparsity/importance_regularization_factor": 0.050522051637307684, "compression/movement_sparsity/importance_threshold": -0.1972110536213968, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17376708984375, "epoch": 5.76, "learning_rate": 8.218877695137294e-09, "loss": 0.1801, "step": 15930, "task_loss": 0.42871758341789246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05055901344829554, "compression/movement_sparsity/importance_threshold": -0.19670757912997971, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17503592371940613, "epoch": 5.76, "learning_rate": 7.97854140077281e-09, "loss": 0.1781, "step": 15940, "task_loss": 0.3418278694152832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05059591229735162, "compression/movement_sparsity/importance_threshold": -0.19620496227312456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19280436635017395, "epoch": 5.76, "learning_rate": 7.741757326475195e-09, "loss": 0.1785, "step": 15950, "task_loss": 0.5458291172981262 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05063274823814735, "compression/movement_sparsity/importance_threshold": -0.19570320231974747, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1474490612745285, "epoch": 5.77, "learning_rate": 7.508526320118114e-09, "loss": 0.1703, "step": 15960, "task_loss": 0.31817954778671265 }, { "compression/movement_sparsity/importance_regularization_factor": 0.050669521324354136, "compression/movement_sparsity/importance_threshold": -0.19520229853876425, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22078892588615417, "epoch": 5.77, "learning_rate": 7.2788492168529556e-09, "loss": 0.1843, "step": 15970, "task_loss": 0.45727652311325073 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05070623160964339, "compression/movement_sparsity/importance_threshold": -0.19470225019909138, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14431482553482056, "epoch": 5.78, "learning_rate": 7.052726839105072e-09, "loss": 0.1673, "step": 15980, "task_loss": 0.7761343717575073 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05074287914768652, "compression/movement_sparsity/importance_threshold": -0.19420305656964454, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1601599156856537, "epoch": 5.78, "learning_rate": 6.830159996570883e-09, "loss": 0.1785, "step": 15990, "task_loss": 0.41845470666885376 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05077946399215494, "compression/movement_sparsity/importance_threshold": -0.19370471691933988, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18368986248970032, "epoch": 5.78, "learning_rate": 6.611149486215772e-09, "loss": 0.1674, "step": 16000, "task_loss": 0.35329797863960266 }, { "epoch": 5.78, "eval_exact_match": 83.65184484389782, "eval_f1": 90.04175695740012, "step": 16000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05081598619672006, "compression/movement_sparsity/importance_threshold": -0.19320723051709365, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13221392035484314, "epoch": 5.79, "learning_rate": 6.395696092269975e-09, "loss": 0.1698, "step": 16010, "task_loss": 0.43325942754745483 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05085244581505328, "compression/movement_sparsity/importance_threshold": -0.19271059663182188, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16434437036514282, "epoch": 5.79, "learning_rate": 6.183800586226917e-09, "loss": 0.1674, "step": 16020, "task_loss": 0.47905054688453674 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05088884290082603, "compression/movement_sparsity/importance_threshold": -0.19221481453244038, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12508782744407654, "epoch": 5.79, "learning_rate": 5.975463726839769e-09, "loss": 0.1762, "step": 16030, "task_loss": 0.16149437427520752 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05092517750770971, "compression/movement_sparsity/importance_threshold": -0.1917198834878654, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13816924393177032, "epoch": 5.8, "learning_rate": 5.7706862601188956e-09, "loss": 0.1769, "step": 16040, "task_loss": 0.4390493631362915 }, { "compression/movement_sparsity/importance_regularization_factor": 0.050961449689375715, "compression/movement_sparsity/importance_threshold": -0.19122580276701318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15949416160583496, "epoch": 5.8, "learning_rate": 5.569468919329412e-09, "loss": 0.1825, "step": 16050, "task_loss": 0.376995712518692 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05099765949949549, "compression/movement_sparsity/importance_threshold": -0.19073257163879942, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17084449529647827, "epoch": 5.8, "learning_rate": 5.371812424988298e-09, "loss": 0.1777, "step": 16060, "task_loss": 0.5042399168014526 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05103380699174041, "compression/movement_sparsity/importance_threshold": -0.1902401893721405, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14627447724342346, "epoch": 5.81, "learning_rate": 5.177717484861843e-09, "loss": 0.1862, "step": 16070, "task_loss": 0.8221181631088257 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051069892219781914, "compression/movement_sparsity/importance_threshold": -0.1897486552359523, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1746702343225479, "epoch": 5.81, "learning_rate": 4.987184793962984e-09, "loss": 0.1725, "step": 16080, "task_loss": 0.7324734926223755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05110591523729141, "compression/movement_sparsity/importance_threshold": -0.18925796849915089, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1688476949930191, "epoch": 5.81, "learning_rate": 4.800215034549527e-09, "loss": 0.183, "step": 16090, "task_loss": 0.46025651693344116 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051141876097940284, "compression/movement_sparsity/importance_threshold": -0.1887681284306526, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1825210154056549, "epoch": 5.82, "learning_rate": 4.616808876120592e-09, "loss": 0.1837, "step": 16100, "task_loss": 0.3381309509277344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051177774855399975, "compression/movement_sparsity/importance_threshold": -0.18827913429937304, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17920146882534027, "epoch": 5.82, "learning_rate": 4.4369669754150686e-09, "loss": 0.188, "step": 16110, "task_loss": 0.5360010862350464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05121361156334188, "compression/movement_sparsity/importance_threshold": -0.18779098537422856, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16270099580287933, "epoch": 5.83, "learning_rate": 4.260689976408938e-09, "loss": 0.1853, "step": 16120, "task_loss": 0.35809409618377686 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0512493862754374, "compression/movement_sparsity/importance_threshold": -0.1873036809241353, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15442490577697754, "epoch": 5.83, "learning_rate": 4.087978510313173e-09, "loss": 0.1841, "step": 16130, "task_loss": 0.3518342971801758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05128509904535796, "compression/movement_sparsity/importance_threshold": -0.18681722021800928, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13016200065612793, "epoch": 5.83, "learning_rate": 3.91883319557107e-09, "loss": 0.1738, "step": 16140, "task_loss": 0.3382733464241028 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05132074992677497, "compression/movement_sparsity/importance_threshold": -0.18633160252476644, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2215452790260315, "epoch": 5.84, "learning_rate": 3.753254637856362e-09, "loss": 0.1911, "step": 16150, "task_loss": 0.3834210932254791 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05135633897335984, "compression/movement_sparsity/importance_threshold": -0.1858468271133228, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1495056003332138, "epoch": 5.84, "learning_rate": 3.5912434300711113e-09, "loss": 0.1791, "step": 16160, "task_loss": 0.2563254237174988 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05139186623878396, "compression/movement_sparsity/importance_threshold": -0.1853628932525948, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1634816825389862, "epoch": 5.84, "learning_rate": 3.4328001523432625e-09, "loss": 0.1836, "step": 16170, "task_loss": 0.49831482768058777 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05142733177671878, "compression/movement_sparsity/importance_threshold": -0.18487980021149808, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18057052791118622, "epoch": 5.85, "learning_rate": 3.277925372024981e-09, "loss": 0.1897, "step": 16180, "task_loss": 0.4678457975387573 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051462735640835686, "compression/movement_sparsity/importance_threshold": -0.18439754725894886, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15450020134449005, "epoch": 5.85, "learning_rate": 3.1266196436902092e-09, "loss": 0.1728, "step": 16190, "task_loss": 0.3961532711982727 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051498077884806086, "compression/movement_sparsity/importance_threshold": -0.18391613366386328, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1460452675819397, "epoch": 5.85, "learning_rate": 2.9788835091328902e-09, "loss": 0.1675, "step": 16200, "task_loss": 0.3941865563392639 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051533358562301394, "compression/movement_sparsity/importance_threshold": -0.18343555869515726, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18922826647758484, "epoch": 5.86, "learning_rate": 2.834717497364969e-09, "loss": 0.1761, "step": 16210, "task_loss": 0.6385120153427124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05156857772699304, "compression/movement_sparsity/importance_threshold": -0.18295582162174695, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16979092359542847, "epoch": 5.86, "learning_rate": 2.6941221246147283e-09, "loss": 0.1839, "step": 16220, "task_loss": 0.4651219844818115 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0516037354325524, "compression/movement_sparsity/importance_threshold": -0.1824769217125486, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21407416462898254, "epoch": 5.87, "learning_rate": 2.557097894324567e-09, "loss": 0.1886, "step": 16230, "task_loss": 0.60257887840271 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0516388317326509, "compression/movement_sparsity/importance_threshold": -0.181998858236478, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19913232326507568, "epoch": 5.87, "learning_rate": 2.4236452971493348e-09, "loss": 0.1795, "step": 16240, "task_loss": 0.44450339674949646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05167386668095997, "compression/movement_sparsity/importance_threshold": -0.1815216304624513, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17576679587364197, "epoch": 5.87, "learning_rate": 2.2937648109547793e-09, "loss": 0.1725, "step": 16250, "task_loss": 0.3722856938838959 }, { "epoch": 5.87, "eval_exact_match": 83.62346263008514, "eval_f1": 90.07747022728749, "step": 16250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05170884033115099, "compression/movement_sparsity/importance_threshold": -0.18104523765938463, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16510339081287384, "epoch": 5.88, "learning_rate": 2.167456900815545e-09, "loss": 0.1934, "step": 16260, "task_loss": 0.5060644149780273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051743752736895396, "compression/movement_sparsity/importance_threshold": -0.18056967909619392, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15950855612754822, "epoch": 5.88, "learning_rate": 2.0447220190136225e-09, "loss": 0.1809, "step": 16270, "task_loss": 0.5934314727783203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05177860395186458, "compression/movement_sparsity/importance_threshold": -0.18009495404179532, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18966922163963318, "epoch": 5.88, "learning_rate": 1.9255606050369024e-09, "loss": 0.1852, "step": 16280, "task_loss": 0.4227147698402405 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05181339402972995, "compression/movement_sparsity/importance_threshold": -0.17962106176510495, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18544501066207886, "epoch": 5.89, "learning_rate": 1.8099730855773986e-09, "loss": 0.1664, "step": 16290, "task_loss": 0.34008803963661194 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05184812302416294, "compression/movement_sparsity/importance_threshold": -0.17914800153503885, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16752862930297852, "epoch": 5.89, "learning_rate": 1.6979598745294754e-09, "loss": 0.1778, "step": 16300, "task_loss": 0.417269229888916 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051882790988834936, "compression/movement_sparsity/importance_threshold": -0.17867577262051304, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15857253968715668, "epoch": 5.89, "learning_rate": 1.5895213729889555e-09, "loss": 0.1804, "step": 16310, "task_loss": 0.773322582244873 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051917397977417366, "compression/movement_sparsity/importance_threshold": -0.17820437429044367, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1497032344341278, "epoch": 5.9, "learning_rate": 1.484657969251346e-09, "loss": 0.1761, "step": 16320, "task_loss": 0.3770208954811096 }, { "compression/movement_sparsity/importance_regularization_factor": 0.051951944043581624, "compression/movement_sparsity/importance_threshold": -0.17773380581374676, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11823238432407379, "epoch": 5.9, "learning_rate": 1.3833700388103943e-09, "loss": 0.1767, "step": 16330, "task_loss": 0.2198001593351364 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05198642924099913, "compression/movement_sparsity/importance_threshold": -0.17726406645933845, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16819697618484497, "epoch": 5.91, "learning_rate": 1.285657944356977e-09, "loss": 0.1708, "step": 16340, "task_loss": 0.35815563797950745 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052020853623341315, "compression/movement_sparsity/importance_threshold": -0.17679515549613445, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15140169858932495, "epoch": 5.91, "learning_rate": 1.1915220357772149e-09, "loss": 0.1769, "step": 16350, "task_loss": 0.5908142328262329 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05205521724427954, "compression/movement_sparsity/importance_threshold": -0.17632707219305144, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14047864079475403, "epoch": 5.91, "learning_rate": 1.1009626501523595e-09, "loss": 0.1804, "step": 16360, "task_loss": 0.31691184639930725 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05208952015748527, "compression/movement_sparsity/importance_threshold": -0.175859815819005, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1407729983329773, "epoch": 5.92, "learning_rate": 1.0139801117562408e-09, "loss": 0.1801, "step": 16370, "task_loss": 0.5783429145812988 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05212376241662988, "compression/movement_sparsity/importance_threshold": -0.1753933856429113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1462913155555725, "epoch": 5.92, "learning_rate": 9.30574732055156e-10, "loss": 0.179, "step": 16380, "task_loss": 0.5032361149787903 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052157944075384786, "compression/movement_sparsity/importance_threshold": -0.17492778093368655, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16070584952831268, "epoch": 5.92, "learning_rate": 8.507468097062043e-10, "loss": 0.1658, "step": 16390, "task_loss": 0.27588170766830444 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05219206518742141, "compression/movement_sparsity/importance_threshold": -0.17446300096024658, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18269900977611542, "epoch": 5.93, "learning_rate": 7.744966305563982e-10, "loss": 0.1777, "step": 16400, "task_loss": 0.3831246495246887 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05222612580641116, "compression/movement_sparsity/importance_threshold": -0.17399904499150776, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1743420958518982, "epoch": 5.93, "learning_rate": 7.018244676415541e-10, "loss": 0.1687, "step": 16410, "task_loss": 0.460012823343277 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05226012598602544, "compression/movement_sparsity/importance_threshold": -0.17353591229638587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15871578454971313, "epoch": 5.93, "learning_rate": 6.327305811852923e-10, "loss": 0.1759, "step": 16420, "task_loss": 0.3666446805000305 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05229406577993566, "compression/movement_sparsity/importance_threshold": -0.17307360214379708, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12524762749671936, "epoch": 5.94, "learning_rate": 5.672152185983714e-10, "loss": 0.1674, "step": 16430, "task_loss": 0.5719473361968994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05232794524181323, "compression/movement_sparsity/importance_threshold": -0.17261211380265762, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13641473650932312, "epoch": 5.94, "learning_rate": 5.052786144775778e-10, "loss": 0.1772, "step": 16440, "task_loss": 0.491558313369751 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052361764425329575, "compression/movement_sparsity/importance_threshold": -0.1721514465418833, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13365453481674194, "epoch": 5.95, "learning_rate": 4.469209906048377e-10, "loss": 0.1727, "step": 16450, "task_loss": 0.2934437394142151 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05239552338415609, "compression/movement_sparsity/importance_threshold": -0.17169159963039038, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14834952354431152, "epoch": 5.95, "learning_rate": 3.921425559463287e-10, "loss": 0.1874, "step": 16460, "task_loss": 0.4384889304637909 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05242922217196419, "compression/movement_sparsity/importance_threshold": -0.17123257233709488, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15647485852241516, "epoch": 5.95, "learning_rate": 3.4094350665236913e-10, "loss": 0.1962, "step": 16470, "task_loss": 0.5159170031547546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05246286084242529, "compression/movement_sparsity/importance_threshold": -0.17077436393091272, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14214615523815155, "epoch": 5.96, "learning_rate": 2.933240260558634e-10, "loss": 0.1902, "step": 16480, "task_loss": 0.31043171882629395 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05249643944921079, "compression/movement_sparsity/importance_threshold": -0.17031697368076015, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15785396099090576, "epoch": 5.96, "learning_rate": 2.4928428467207997e-10, "loss": 0.1898, "step": 16490, "task_loss": 0.2698136568069458 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052529958045992114, "compression/movement_sparsity/importance_threshold": -0.1698604008555531, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1571805626153946, "epoch": 5.96, "learning_rate": 2.0882444019809653e-10, "loss": 0.179, "step": 16500, "task_loss": 0.29651594161987305 }, { "epoch": 5.96, "eval_exact_match": 83.73699148533585, "eval_f1": 90.03229638831709, "step": 16500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05256341668644067, "compression/movement_sparsity/importance_threshold": -0.1694046447242078, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1439880132675171, "epoch": 5.97, "learning_rate": 1.719446375121336e-10, "loss": 0.1733, "step": 16510, "task_loss": 0.4038183093070984 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05259681542422786, "compression/movement_sparsity/importance_threshold": -0.16894970455564018, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14894336462020874, "epoch": 5.97, "learning_rate": 1.3864500867311057e-10, "loss": 0.1919, "step": 16520, "task_loss": 0.32719749212265015 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0526301543130251, "compression/movement_sparsity/importance_threshold": -0.16849557961876638, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13948574662208557, "epoch": 5.97, "learning_rate": 1.089256729197574e-10, "loss": 0.1815, "step": 16530, "task_loss": 0.6157231330871582 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0526634334065038, "compression/movement_sparsity/importance_threshold": -0.16804226918250242, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15811459720134735, "epoch": 5.98, "learning_rate": 8.278673667094783e-11, "loss": 0.169, "step": 16540, "task_loss": 0.3574429750442505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05269665275833536, "compression/movement_sparsity/importance_threshold": -0.16758977251576435, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17332813143730164, "epoch": 5.98, "learning_rate": 6.022829352458902e-11, "loss": 0.1862, "step": 16550, "task_loss": 0.6019191741943359 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05272981242219122, "compression/movement_sparsity/importance_threshold": -0.16713808888746828, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15140379965305328, "epoch": 5.98, "learning_rate": 4.125042425784375e-11, "loss": 0.1758, "step": 16560, "task_loss": 0.3916362524032593 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05276291245174276, "compression/movement_sparsity/importance_threshold": -0.16668721756653027, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18484221398830414, "epoch": 5.99, "learning_rate": 2.5853196826353118e-11, "loss": 0.1708, "step": 16570, "task_loss": 0.5214443206787109 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05279595290066141, "compression/movement_sparsity/importance_threshold": -0.16623715782186643, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14860549569129944, "epoch": 5.99, "learning_rate": 1.403666636445866e-11, "loss": 0.1672, "step": 16580, "task_loss": 0.4291364848613739 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052828933822618565, "compression/movement_sparsity/importance_threshold": -0.1657879089223927, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1612284928560257, "epoch": 6.0, "learning_rate": 5.8008751845362024e-12, "loss": 0.1767, "step": 16590, "task_loss": 0.4751649498939514 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052861855271285646, "compression/movement_sparsity/importance_threshold": -0.16533947013702532, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15080714225769043, "epoch": 6.0, "learning_rate": 1.1458527773289262e-12, "loss": 0.1735, "step": 16600, "task_loss": 0.34060317277908325 }, { "compression/movement_sparsity/importance_regularization_factor": 0.052894717300334064, "compression/movement_sparsity/importance_threshold": -0.16489184073468022, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13875526189804077, "epoch": 6.0, "learning_rate": 1.9999999283841884e-06, "loss": 0.1644, "step": 16610, "task_loss": 0.36576491594314575 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05292751996343522, "compression/movement_sparsity/importance_threshold": -0.16444501998427363, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16185107827186584, "epoch": 6.01, "learning_rate": 1.9999974218318666e-06, "loss": 0.1683, "step": 16620, "task_loss": 0.30015283823013306 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05296026331426053, "compression/movement_sparsity/importance_threshold": -0.1639990071547215, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13772368431091309, "epoch": 6.01, "learning_rate": 1.999991334499232e-06, "loss": 0.1636, "step": 16630, "task_loss": 0.5743692517280579 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05299294740648142, "compression/movement_sparsity/importance_threshold": -0.1635538015149396, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17779430747032166, "epoch": 6.01, "learning_rate": 1.9999816664080824e-06, "loss": 0.168, "step": 16640, "task_loss": 0.47371160984039307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053025572293769266, "compression/movement_sparsity/importance_threshold": -0.16310940233384463, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12829241156578064, "epoch": 6.02, "learning_rate": 1.999968417593037e-06, "loss": 0.1725, "step": 16650, "task_loss": 0.22890448570251465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05305813802979552, "compression/movement_sparsity/importance_threshold": -0.1626658088803521, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14386427402496338, "epoch": 6.02, "learning_rate": 1.9999515881015373e-06, "loss": 0.1647, "step": 16660, "task_loss": 0.41270148754119873 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05309064466823156, "compression/movement_sparsity/importance_threshold": -0.16222302042337844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13118350505828857, "epoch": 6.02, "learning_rate": 1.999931177993846e-06, "loss": 0.1577, "step": 16670, "task_loss": 0.9137722253799438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05312309226274881, "compression/movement_sparsity/importance_threshold": -0.16178103623183948, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14832520484924316, "epoch": 6.03, "learning_rate": 1.9999071873430475e-06, "loss": 0.157, "step": 16680, "task_loss": 0.44995182752609253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05315548086701868, "compression/movement_sparsity/importance_threshold": -0.16133985557465147, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12737812101840973, "epoch": 6.03, "learning_rate": 1.9998796162350473e-06, "loss": 0.1638, "step": 16690, "task_loss": 0.2989009916782379 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053187810534712575, "compression/movement_sparsity/importance_threshold": -0.16089947772073032, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14384889602661133, "epoch": 6.04, "learning_rate": 1.999848464768571e-06, "loss": 0.1604, "step": 16700, "task_loss": 0.636667013168335 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053220081319501907, "compression/movement_sparsity/importance_threshold": -0.1604599019389923, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14905652403831482, "epoch": 6.04, "learning_rate": 1.999813733055167e-06, "loss": 0.1519, "step": 16710, "task_loss": 0.5293204188346863 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0532522932750581, "compression/movement_sparsity/importance_threshold": -0.16002112749835318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14722052216529846, "epoch": 6.04, "learning_rate": 1.9997754212192007e-06, "loss": 0.1677, "step": 16720, "task_loss": 0.4709875285625458 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05328444645505254, "compression/movement_sparsity/importance_threshold": -0.15958315366772924, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16393886506557465, "epoch": 6.05, "learning_rate": 1.9997335293978595e-06, "loss": 0.166, "step": 16730, "task_loss": 0.4563778340816498 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05331654091315667, "compression/movement_sparsity/importance_threshold": -0.1591459797160365, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1607019305229187, "epoch": 6.05, "learning_rate": 1.999688057741149e-06, "loss": 0.1704, "step": 16740, "task_loss": 0.49112173914909363 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05334857670304187, "compression/movement_sparsity/importance_threshold": -0.1587096049121911, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1364935040473938, "epoch": 6.05, "learning_rate": 1.999639006411894e-06, "loss": 0.1494, "step": 16750, "task_loss": 0.3835960328578949 }, { "epoch": 6.05, "eval_exact_match": 83.57615894039735, "eval_f1": 89.92193376526602, "step": 16750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053380553878379575, "compression/movement_sparsity/importance_threshold": -0.15827402852510886, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1750296950340271, "epoch": 6.06, "learning_rate": 1.9995863755857365e-06, "loss": 0.174, "step": 16760, "task_loss": 0.39857983589172363 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05341247249284118, "compression/movement_sparsity/importance_threshold": -0.157839249823706, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15796607732772827, "epoch": 6.06, "learning_rate": 1.9995301654511367e-06, "loss": 0.1571, "step": 16770, "task_loss": 0.6707438826560974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05344433260009809, "compression/movement_sparsity/importance_threshold": -0.1574052680768988, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14516857266426086, "epoch": 6.06, "learning_rate": 1.999470376209371e-06, "loss": 0.1627, "step": 16780, "task_loss": 0.5306075811386108 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053476134253821736, "compression/movement_sparsity/importance_threshold": -0.15697208255360307, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18241889774799347, "epoch": 6.07, "learning_rate": 1.9994070080745324e-06, "loss": 0.1769, "step": 16790, "task_loss": 0.6663118600845337 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05350787750768351, "compression/movement_sparsity/importance_threshold": -0.1565396925227348, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1332705169916153, "epoch": 6.07, "learning_rate": 1.9993400612735286e-06, "loss": 0.1623, "step": 16800, "task_loss": 0.518804132938385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053539562415354826, "compression/movement_sparsity/importance_threshold": -0.1561080972532104, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13718751072883606, "epoch": 6.08, "learning_rate": 1.999269536046082e-06, "loss": 0.1616, "step": 16810, "task_loss": 0.29684680700302124 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0535711890305071, "compression/movement_sparsity/importance_threshold": -0.15567729601394564, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16599276661872864, "epoch": 6.08, "learning_rate": 1.9991954326447287e-06, "loss": 0.1692, "step": 16820, "task_loss": 0.32731759548187256 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05360275740681175, "compression/movement_sparsity/importance_threshold": -0.15524728807385668, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16123579442501068, "epoch": 6.08, "learning_rate": 1.9991177513348175e-06, "loss": 0.1853, "step": 16830, "task_loss": 0.48672449588775635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053634267597940176, "compression/movement_sparsity/importance_threshold": -0.15481807270185954, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19667679071426392, "epoch": 6.09, "learning_rate": 1.9990364923945086e-06, "loss": 0.1792, "step": 16840, "task_loss": 0.5587581992149353 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05366571965756378, "compression/movement_sparsity/importance_threshold": -0.15438964916687037, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16758200526237488, "epoch": 6.09, "learning_rate": 1.9989516561147736e-06, "loss": 0.1634, "step": 16850, "task_loss": 0.39092981815338135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053697113639354, "compression/movement_sparsity/importance_threshold": -0.15396201673780519, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16262221336364746, "epoch": 6.09, "learning_rate": 1.9988632427993927e-06, "loss": 0.1754, "step": 16860, "task_loss": 0.632863461971283 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05372844959698221, "compression/movement_sparsity/importance_threshold": -0.15353517468358013, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13204121589660645, "epoch": 6.1, "learning_rate": 1.9987712527649556e-06, "loss": 0.1688, "step": 16870, "task_loss": 0.37428024411201477 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05375972758411986, "compression/movement_sparsity/importance_threshold": -0.15310912227311113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16641512513160706, "epoch": 6.1, "learning_rate": 1.9986756863408597e-06, "loss": 0.1674, "step": 16880, "task_loss": 0.3117241859436035 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05379094765443833, "compression/movement_sparsity/importance_threshold": -0.15268385877531432, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15495559573173523, "epoch": 6.1, "learning_rate": 1.9985765438693077e-06, "loss": 0.1739, "step": 16890, "task_loss": 0.7027961015701294 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053822109861609034, "compression/movement_sparsity/importance_threshold": -0.15225938345910583, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15895962715148926, "epoch": 6.11, "learning_rate": 1.998473825705308e-06, "loss": 0.1733, "step": 16900, "task_loss": 0.5291743874549866 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0538532142593034, "compression/movement_sparsity/importance_threshold": -0.1518356955934017, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15101155638694763, "epoch": 6.11, "learning_rate": 1.9983675322166733e-06, "loss": 0.1672, "step": 16910, "task_loss": 0.5095721483230591 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053884260901192825, "compression/movement_sparsity/importance_threshold": -0.15141279444711797, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12188731133937836, "epoch": 6.11, "learning_rate": 1.9982576637840178e-06, "loss": 0.1629, "step": 16920, "task_loss": 0.5534610748291016 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053915249840948726, "compression/movement_sparsity/importance_threshold": -0.15099067928917054, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15203392505645752, "epoch": 6.12, "learning_rate": 1.9981442208007564e-06, "loss": 0.1687, "step": 16930, "task_loss": 0.4714369475841522 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0539461811322425, "compression/movement_sparsity/importance_threshold": -0.15056934938847588, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1437663435935974, "epoch": 6.12, "learning_rate": 1.9980272036731065e-06, "loss": 0.1692, "step": 16940, "task_loss": 0.2952896058559418 }, { "compression/movement_sparsity/importance_regularization_factor": 0.053977054828745576, "compression/movement_sparsity/importance_threshold": -0.1501488040139497, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14852911233901978, "epoch": 6.13, "learning_rate": 1.9979066128200797e-06, "loss": 0.1635, "step": 16950, "task_loss": 0.7087572813034058 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05400787098412936, "compression/movement_sparsity/importance_threshold": -0.14972904243450824, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15441861748695374, "epoch": 6.13, "learning_rate": 1.997782448673488e-06, "loss": 0.1647, "step": 16960, "task_loss": 0.46169987320899963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05403862965206526, "compression/movement_sparsity/importance_threshold": -0.14931006391906743, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13028943538665771, "epoch": 6.13, "learning_rate": 1.9976547116779365e-06, "loss": 0.1576, "step": 16970, "task_loss": 0.4005166292190552 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054069330886224676, "compression/movement_sparsity/importance_threshold": -0.1488918677365435, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1454225480556488, "epoch": 6.14, "learning_rate": 1.9975234022908244e-06, "loss": 0.1506, "step": 16980, "task_loss": 0.6841634511947632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054099974740279035, "compression/movement_sparsity/importance_threshold": -0.1484744531558524, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15946821868419647, "epoch": 6.14, "learning_rate": 1.997388520982343e-06, "loss": 0.1786, "step": 16990, "task_loss": 0.5624659061431885 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054130561267899745, "compression/movement_sparsity/importance_threshold": -0.14805781944591023, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13070663809776306, "epoch": 6.14, "learning_rate": 1.997250068235474e-06, "loss": 0.1544, "step": 17000, "task_loss": 0.46055838465690613 }, { "epoch": 6.14, "eval_exact_match": 83.52885525070955, "eval_f1": 89.92910145189475, "step": 17000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05416109052275821, "compression/movement_sparsity/importance_threshold": -0.14764196587563316, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14563003182411194, "epoch": 6.15, "learning_rate": 1.9971080445459876e-06, "loss": 0.1777, "step": 17010, "task_loss": 0.31575658917427063 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054191562558525835, "compression/movement_sparsity/importance_threshold": -0.1472268917139371, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12489091604948044, "epoch": 6.15, "learning_rate": 1.9969624504224404e-06, "loss": 0.1681, "step": 17020, "task_loss": 0.3158435821533203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05422197742887405, "compression/movement_sparsity/importance_threshold": -0.14681259622973808, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1576835811138153, "epoch": 6.15, "learning_rate": 1.996813286386175e-06, "loss": 0.1627, "step": 17030, "task_loss": 0.5685656070709229 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05425233518747425, "compression/movement_sparsity/importance_threshold": -0.14639907869195234, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15400083363056183, "epoch": 6.16, "learning_rate": 1.9966605529713155e-06, "loss": 0.1656, "step": 17040, "task_loss": 0.24658185243606567 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05428263588799785, "compression/movement_sparsity/importance_threshold": -0.14598633836949582, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13884764909744263, "epoch": 6.16, "learning_rate": 1.996504250724769e-06, "loss": 0.1629, "step": 17050, "task_loss": 0.5981327295303345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054312879584116264, "compression/movement_sparsity/importance_threshold": -0.14557437453128474, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1670169234275818, "epoch": 6.17, "learning_rate": 1.9963443802062207e-06, "loss": 0.1626, "step": 17060, "task_loss": 0.34697288274765015 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054343066329500894, "compression/movement_sparsity/importance_threshold": -0.14516318644623505, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16179323196411133, "epoch": 6.17, "learning_rate": 1.996180941988133e-06, "loss": 0.1609, "step": 17070, "task_loss": 0.5075516700744629 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05437319617782316, "compression/movement_sparsity/importance_threshold": -0.14475277338326276, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1471652388572693, "epoch": 6.17, "learning_rate": 1.9960139366557434e-06, "loss": 0.1641, "step": 17080, "task_loss": 0.41368719935417175 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05440326918275447, "compression/movement_sparsity/importance_threshold": -0.144343134611284, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1576196253299713, "epoch": 6.18, "learning_rate": 1.995843364807064e-06, "loss": 0.179, "step": 17090, "task_loss": 0.5158824324607849 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05443328539796623, "compression/movement_sparsity/importance_threshold": -0.14393426939921494, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1396743804216385, "epoch": 6.18, "learning_rate": 1.9956692270528757e-06, "loss": 0.1645, "step": 17100, "task_loss": 0.4924278259277344 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054463244877129856, "compression/movement_sparsity/importance_threshold": -0.14352617701597148, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18293413519859314, "epoch": 6.18, "learning_rate": 1.9954915240167297e-06, "loss": 0.1677, "step": 17110, "task_loss": 0.60299152135849 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05449314767391675, "compression/movement_sparsity/importance_threshold": -0.14311885673046976, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.167672261595726, "epoch": 6.19, "learning_rate": 1.995310256334943e-06, "loss": 0.1653, "step": 17120, "task_loss": 0.39202722907066345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05452299384199834, "compression/movement_sparsity/importance_threshold": -0.1427123078116258, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18624365329742432, "epoch": 6.19, "learning_rate": 1.995125424656597e-06, "loss": 0.1685, "step": 17130, "task_loss": 0.6348949074745178 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054552783435046014, "compression/movement_sparsity/importance_threshold": -0.14230652952835587, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14822807908058167, "epoch": 6.19, "learning_rate": 1.9949370296435347e-06, "loss": 0.1655, "step": 17140, "task_loss": 0.44887471199035645 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0545825165067312, "compression/movement_sparsity/importance_threshold": -0.14190152114957577, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1382816731929779, "epoch": 6.2, "learning_rate": 1.99474507197036e-06, "loss": 0.1745, "step": 17150, "task_loss": 0.2659206986427307 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05461219311072531, "compression/movement_sparsity/importance_threshold": -0.14149728194420164, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1857585608959198, "epoch": 6.2, "learning_rate": 1.9945495523244317e-06, "loss": 0.1702, "step": 17160, "task_loss": 0.5841749906539917 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05464181330069973, "compression/movement_sparsity/importance_threshold": -0.1410938111811496, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14488109946250916, "epoch": 6.21, "learning_rate": 1.994350471405865e-06, "loss": 0.1559, "step": 17170, "task_loss": 0.7074770927429199 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054671377130325896, "compression/movement_sparsity/importance_threshold": -0.14069110812933583, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13776680827140808, "epoch": 6.21, "learning_rate": 1.994147829927527e-06, "loss": 0.1737, "step": 17180, "task_loss": 0.48739925026893616 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05470088465327521, "compression/movement_sparsity/importance_threshold": -0.1402891720576761, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20274044573307037, "epoch": 6.21, "learning_rate": 1.9939416286150343e-06, "loss": 0.1674, "step": 17190, "task_loss": 0.47881031036376953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05473033592321909, "compression/movement_sparsity/importance_threshold": -0.13988800223508668, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1323615312576294, "epoch": 6.22, "learning_rate": 1.9937318682067498e-06, "loss": 0.1702, "step": 17200, "task_loss": 0.3447995185852051 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05475973099382892, "compression/movement_sparsity/importance_threshold": -0.13948759793048382, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15247386693954468, "epoch": 6.22, "learning_rate": 1.9935185494537817e-06, "loss": 0.1743, "step": 17210, "task_loss": 0.5565829277038574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054789069918776155, "compression/movement_sparsity/importance_threshold": -0.1390879584127831, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16268417239189148, "epoch": 6.22, "learning_rate": 1.9933016731199798e-06, "loss": 0.1532, "step": 17220, "task_loss": 0.5147459506988525 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05481835275173216, "compression/movement_sparsity/importance_threshold": -0.138689082950901, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13939876854419708, "epoch": 6.23, "learning_rate": 1.993081239981932e-06, "loss": 0.1782, "step": 17230, "task_loss": 0.19769251346588135 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05484757954636838, "compression/movement_sparsity/importance_threshold": -0.1382909708137533, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1804233193397522, "epoch": 6.23, "learning_rate": 1.9928572508289638e-06, "loss": 0.1763, "step": 17240, "task_loss": 0.3552435040473938 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0548767503563562, "compression/movement_sparsity/importance_threshold": -0.1378936212702564, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12657399475574493, "epoch": 6.23, "learning_rate": 1.9926297064631324e-06, "loss": 0.1487, "step": 17250, "task_loss": 0.3169633448123932 }, { "epoch": 6.23, "eval_exact_match": 83.73699148533585, "eval_f1": 90.10480575796605, "step": 17250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05490586523536706, "compression/movement_sparsity/importance_threshold": -0.13749703358932597, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15277908742427826, "epoch": 6.24, "learning_rate": 1.9923986076992264e-06, "loss": 0.171, "step": 17260, "task_loss": 0.472973108291626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05493492423707234, "compression/movement_sparsity/importance_threshold": -0.13710120703987838, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15695881843566895, "epoch": 6.24, "learning_rate": 1.9921639553647624e-06, "loss": 0.1665, "step": 17270, "task_loss": 0.6075320839881897 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05496392741514347, "compression/movement_sparsity/importance_threshold": -0.13670614089082955, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18700659275054932, "epoch": 6.25, "learning_rate": 1.991925750299981e-06, "loss": 0.1821, "step": 17280, "task_loss": 0.6988071799278259 }, { "compression/movement_sparsity/importance_regularization_factor": 0.054992874823251846, "compression/movement_sparsity/importance_threshold": -0.13631183441109584, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15885579586029053, "epoch": 6.25, "learning_rate": 1.9916839933578437e-06, "loss": 0.162, "step": 17290, "task_loss": 0.19446536898612976 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05502176651506889, "compression/movement_sparsity/importance_threshold": -0.13591828686959284, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19656088948249817, "epoch": 6.25, "learning_rate": 1.991438685404032e-06, "loss": 0.1763, "step": 17300, "task_loss": 0.423044353723526 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05505060254426602, "compression/movement_sparsity/importance_threshold": -0.1355254975352368, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16628031432628632, "epoch": 6.26, "learning_rate": 1.9911898273169412e-06, "loss": 0.1661, "step": 17310, "task_loss": 0.35148802399635315 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05507938296451462, "compression/movement_sparsity/importance_threshold": -0.13513346567694406, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16762763261795044, "epoch": 6.26, "learning_rate": 1.990937419987681e-06, "loss": 0.1662, "step": 17320, "task_loss": 0.6487798690795898 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05510810782948613, "compression/movement_sparsity/importance_threshold": -0.13474219056363024, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1576879620552063, "epoch": 6.26, "learning_rate": 1.9906814643200674e-06, "loss": 0.1753, "step": 17330, "task_loss": 0.31458860635757446 }, { "compression/movement_sparsity/importance_regularization_factor": 0.055136777192851934, "compression/movement_sparsity/importance_threshold": -0.13435167146421179, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12366840243339539, "epoch": 6.27, "learning_rate": 1.9904219612306246e-06, "loss": 0.1771, "step": 17340, "task_loss": 0.3803785443305969 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05516539110828347, "compression/movement_sparsity/importance_threshold": -0.13396190764760452, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16160424053668976, "epoch": 6.27, "learning_rate": 1.9901589116485788e-06, "loss": 0.1682, "step": 17350, "task_loss": 0.5825801491737366 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05519394962945213, "compression/movement_sparsity/importance_threshold": -0.13357289838272457, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.225599467754364, "epoch": 6.27, "learning_rate": 1.9898923165158548e-06, "loss": 0.1655, "step": 17360, "task_loss": 0.45244693756103516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05522245281002932, "compression/movement_sparsity/importance_threshold": -0.1331846429384882, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16648636758327484, "epoch": 6.28, "learning_rate": 1.989622176787074e-06, "loss": 0.164, "step": 17370, "task_loss": 0.3290000557899475 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05525090070368647, "compression/movement_sparsity/importance_threshold": -0.1327971405838111, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18633422255516052, "epoch": 6.28, "learning_rate": 1.9893484934295492e-06, "loss": 0.1719, "step": 17380, "task_loss": 0.5563036799430847 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05527929336409498, "compression/movement_sparsity/importance_threshold": -0.13241039058760962, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1366187334060669, "epoch": 6.28, "learning_rate": 1.9890712674232838e-06, "loss": 0.1834, "step": 17390, "task_loss": 0.3646396994590759 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05530763084492626, "compression/movement_sparsity/importance_threshold": -0.1320243922187998, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16789717972278595, "epoch": 6.29, "learning_rate": 1.9887904997609654e-06, "loss": 0.1746, "step": 17400, "task_loss": 0.47817733883857727 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05533591319985172, "compression/movement_sparsity/importance_threshold": -0.13163914474629757, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21855862438678741, "epoch": 6.29, "learning_rate": 1.9885061914479633e-06, "loss": 0.1875, "step": 17410, "task_loss": 0.40912604331970215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05536414048254276, "compression/movement_sparsity/importance_threshold": -0.13125464743901927, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1337123066186905, "epoch": 6.3, "learning_rate": 1.9882183435023266e-06, "loss": 0.166, "step": 17420, "task_loss": 0.3562582731246948 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05539231274667082, "compression/movement_sparsity/importance_threshold": -0.1308708995658806, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16915687918663025, "epoch": 6.3, "learning_rate": 1.987926956954778e-06, "loss": 0.1834, "step": 17430, "task_loss": 0.5122113227844238 }, { "compression/movement_sparsity/importance_regularization_factor": 0.055420430045907285, "compression/movement_sparsity/importance_threshold": -0.13048790039579783, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14700746536254883, "epoch": 6.3, "learning_rate": 1.9876320328487113e-06, "loss": 0.1639, "step": 17440, "task_loss": 0.47502970695495605 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05544849243392357, "compression/movement_sparsity/importance_threshold": -0.13010564919768708, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1848972737789154, "epoch": 6.31, "learning_rate": 1.9873335722401875e-06, "loss": 0.1724, "step": 17450, "task_loss": 0.7271729707717896 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0554764999643911, "compression/movement_sparsity/importance_threshold": -0.12972414524046427, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11952685564756393, "epoch": 6.31, "learning_rate": 1.9870315761979317e-06, "loss": 0.1617, "step": 17460, "task_loss": 0.1481960266828537 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05550445269098127, "compression/movement_sparsity/importance_threshold": -0.12934338779304566, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14205265045166016, "epoch": 6.31, "learning_rate": 1.9867260458033276e-06, "loss": 0.1649, "step": 17470, "task_loss": 0.2205222249031067 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0555323506673655, "compression/movement_sparsity/importance_threshold": -0.12896337612434705, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14286062121391296, "epoch": 6.32, "learning_rate": 1.986416982150416e-06, "loss": 0.1646, "step": 17480, "task_loss": 0.28470635414123535 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05556019394721519, "compression/movement_sparsity/importance_threshold": -0.1285841095032848, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1712353527545929, "epoch": 6.32, "learning_rate": 1.9861043863458876e-06, "loss": 0.1574, "step": 17490, "task_loss": 0.5386769771575928 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05558798258420176, "compression/movement_sparsity/importance_threshold": -0.12820558719877473, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13892130553722382, "epoch": 6.32, "learning_rate": 1.9857882595090833e-06, "loss": 0.1648, "step": 17500, "task_loss": 0.49137255549430847 }, { "epoch": 6.32, "eval_exact_match": 83.58561967833491, "eval_f1": 90.0004746665234, "step": 17500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05561571663199662, "compression/movement_sparsity/importance_threshold": -0.12782780847973296, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16053462028503418, "epoch": 6.33, "learning_rate": 1.985468602771986e-06, "loss": 0.1744, "step": 17510, "task_loss": 0.41019684076309204 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05564339614427117, "compression/movement_sparsity/importance_threshold": -0.12745077261507576, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1520998179912567, "epoch": 6.33, "learning_rate": 1.98514541727922e-06, "loss": 0.1696, "step": 17520, "task_loss": 0.5022592544555664 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05567102117469682, "compression/movement_sparsity/importance_threshold": -0.12707447887371903, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12280330061912537, "epoch": 6.34, "learning_rate": 1.984818704188044e-06, "loss": 0.1538, "step": 17530, "task_loss": 0.34392112493515015 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05569859177694501, "compression/movement_sparsity/importance_threshold": -0.12669892652457881, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2284255176782608, "epoch": 6.34, "learning_rate": 1.9844884646683487e-06, "loss": 0.1699, "step": 17540, "task_loss": 0.39971989393234253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05572610800468712, "compression/movement_sparsity/importance_threshold": -0.12632411483657113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15068349242210388, "epoch": 6.34, "learning_rate": 1.984154699902653e-06, "loss": 0.1684, "step": 17550, "task_loss": 0.36631715297698975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05575356991159457, "compression/movement_sparsity/importance_threshold": -0.12595004307861224, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12671113014221191, "epoch": 6.35, "learning_rate": 1.983817411086097e-06, "loss": 0.1697, "step": 17560, "task_loss": 0.49995699524879456 }, { "compression/movement_sparsity/importance_regularization_factor": 0.055780977551338776, "compression/movement_sparsity/importance_threshold": -0.12557671051961794, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1240062266588211, "epoch": 6.35, "learning_rate": 1.9834765994264426e-06, "loss": 0.1615, "step": 17570, "task_loss": 0.3404150605201721 }, { "compression/movement_sparsity/importance_regularization_factor": 0.055808330977591136, "compression/movement_sparsity/importance_threshold": -0.1252041164285046, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15258857607841492, "epoch": 6.35, "learning_rate": 1.983132266144064e-06, "loss": 0.1698, "step": 17580, "task_loss": 0.7967875599861145 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05583563024402308, "compression/movement_sparsity/importance_threshold": -0.12483226007418802, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14428968727588654, "epoch": 6.36, "learning_rate": 1.9827844124719453e-06, "loss": 0.1595, "step": 17590, "task_loss": 0.44335755705833435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05586287540430599, "compression/movement_sparsity/importance_threshold": -0.12446114072558456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19588825106620789, "epoch": 6.36, "learning_rate": 1.9824330396556784e-06, "loss": 0.1669, "step": 17600, "task_loss": 0.3872692286968231 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0558900665121113, "compression/movement_sparsity/importance_threshold": -0.12409075765161004, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15780013799667358, "epoch": 6.36, "learning_rate": 1.982078148953455e-06, "loss": 0.1693, "step": 17610, "task_loss": 0.36154231429100037 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05591720362111042, "compression/movement_sparsity/importance_threshold": -0.12372111012118059, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18099361658096313, "epoch": 6.37, "learning_rate": 1.981719741636064e-06, "loss": 0.1772, "step": 17620, "task_loss": 0.3962492346763611 }, { "compression/movement_sparsity/importance_regularization_factor": 0.055944286784974755, "compression/movement_sparsity/importance_threshold": -0.12335219740321224, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1900426745414734, "epoch": 6.37, "learning_rate": 1.981357818986887e-06, "loss": 0.1792, "step": 17630, "task_loss": 0.5194408893585205 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0559713160573757, "compression/movement_sparsity/importance_threshold": -0.12298401876662124, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15779553353786469, "epoch": 6.38, "learning_rate": 1.9809923823018917e-06, "loss": 0.1569, "step": 17640, "task_loss": 0.47046637535095215 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05599829149198469, "compression/movement_sparsity/importance_threshold": -0.1226165734803234, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.134331613779068, "epoch": 6.38, "learning_rate": 1.980623432889631e-06, "loss": 0.1596, "step": 17650, "task_loss": 0.3972647786140442 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05602521314247313, "compression/movement_sparsity/importance_threshold": -0.12224986081323497, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18308016657829285, "epoch": 6.38, "learning_rate": 1.9802509720712354e-06, "loss": 0.1725, "step": 17660, "task_loss": 0.6477333307266235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05605208106251243, "compression/movement_sparsity/importance_threshold": -0.12188388003427186, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13036686182022095, "epoch": 6.39, "learning_rate": 1.9798750011804076e-06, "loss": 0.1592, "step": 17670, "task_loss": 0.31419873237609863 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056078895305773986, "compression/movement_sparsity/importance_threshold": -0.12151863041235034, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1570032238960266, "epoch": 6.39, "learning_rate": 1.979495521563421e-06, "loss": 0.1522, "step": 17680, "task_loss": 0.37213003635406494 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056105655925929226, "compression/movement_sparsity/importance_threshold": -0.12115411121638642, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15974989533424377, "epoch": 6.39, "learning_rate": 1.9791125345791115e-06, "loss": 0.165, "step": 17690, "task_loss": 0.33860349655151367 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05613236297664955, "compression/movement_sparsity/importance_threshold": -0.12079032171529602, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13290277123451233, "epoch": 6.4, "learning_rate": 1.9787260415988757e-06, "loss": 0.1589, "step": 17700, "task_loss": 0.2330818772315979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05615901651160638, "compression/movement_sparsity/importance_threshold": -0.12042726117799529, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15578773617744446, "epoch": 6.4, "learning_rate": 1.9783360440066637e-06, "loss": 0.1779, "step": 17710, "task_loss": 0.7083296775817871 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05618561658447112, "compression/movement_sparsity/importance_threshold": -0.12006492887340048, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15509989857673645, "epoch": 6.4, "learning_rate": 1.977942543198974e-06, "loss": 0.1666, "step": 17720, "task_loss": 0.4361785352230072 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056212163248915176, "compression/movement_sparsity/importance_threshold": -0.11970332407042727, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17246806621551514, "epoch": 6.41, "learning_rate": 1.9775455405848506e-06, "loss": 0.171, "step": 17730, "task_loss": 0.756430983543396 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05623865655860996, "compression/movement_sparsity/importance_threshold": -0.11934244603799216, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14905114471912384, "epoch": 6.41, "learning_rate": 1.977145037585877e-06, "loss": 0.171, "step": 17740, "task_loss": 0.33792349696159363 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05626509656722689, "compression/movement_sparsity/importance_threshold": -0.11898229404501082, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13120922446250916, "epoch": 6.41, "learning_rate": 1.9767410356361683e-06, "loss": 0.1557, "step": 17750, "task_loss": 0.32760465145111084 }, { "epoch": 6.41, "eval_exact_match": 83.59508041627247, "eval_f1": 89.9605980268485, "step": 17750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05629148332843737, "compression/movement_sparsity/importance_threshold": -0.11862286736039962, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17200222611427307, "epoch": 6.42, "learning_rate": 1.9763335361823723e-06, "loss": 0.1676, "step": 17760, "task_loss": 0.4891778826713562 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05631781689591282, "compression/movement_sparsity/importance_threshold": -0.11826416525307437, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14608396589756012, "epoch": 6.42, "learning_rate": 1.975922540683658e-06, "loss": 0.1627, "step": 17770, "task_loss": 0.21372440457344055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056344097323324634, "compression/movement_sparsity/importance_threshold": -0.11790618699195143, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15041396021842957, "epoch": 6.43, "learning_rate": 1.975508050611714e-06, "loss": 0.1692, "step": 17780, "task_loss": 0.372314989566803 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05637032466434423, "compression/movement_sparsity/importance_threshold": -0.11754893184594672, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17189109325408936, "epoch": 6.43, "learning_rate": 1.975090067450742e-06, "loss": 0.186, "step": 17790, "task_loss": 0.3796417713165283 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05639649897264304, "compression/movement_sparsity/importance_threshold": -0.11719239908397616, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14438897371292114, "epoch": 6.43, "learning_rate": 1.9746685926974515e-06, "loss": 0.1726, "step": 17800, "task_loss": 0.4204254746437073 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05642262030189243, "compression/movement_sparsity/importance_threshold": -0.11683658797495611, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15841056406497955, "epoch": 6.44, "learning_rate": 1.9742436278610548e-06, "loss": 0.1613, "step": 17810, "task_loss": 0.23542027175426483 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05644868870576385, "compression/movement_sparsity/importance_threshold": -0.11648149778780237, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17361599206924438, "epoch": 6.44, "learning_rate": 1.9738151744632616e-06, "loss": 0.195, "step": 17820, "task_loss": 0.2202863097190857 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05647470423792869, "compression/movement_sparsity/importance_threshold": -0.11612712779143108, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14202536642551422, "epoch": 6.44, "learning_rate": 1.973383234038274e-06, "loss": 0.1661, "step": 17830, "task_loss": 0.30595502257347107 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05650066695205837, "compression/movement_sparsity/importance_threshold": -0.1157734772547585, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1708425134420395, "epoch": 6.45, "learning_rate": 1.972947808132779e-06, "loss": 0.1743, "step": 17840, "task_loss": 0.849858820438385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0565265769018243, "compression/movement_sparsity/importance_threshold": -0.11542054544670044, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15541870892047882, "epoch": 6.45, "learning_rate": 1.972508898305946e-06, "loss": 0.1744, "step": 17850, "task_loss": 0.4322483241558075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05655243414089788, "compression/movement_sparsity/importance_threshold": -0.11506833163617303, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15814724564552307, "epoch": 6.45, "learning_rate": 1.9720665061294173e-06, "loss": 0.171, "step": 17860, "task_loss": 0.2430177628993988 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05657823872295053, "compression/movement_sparsity/importance_threshold": -0.11471683509209252, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14094632863998413, "epoch": 6.46, "learning_rate": 1.9716206331873075e-06, "loss": 0.175, "step": 17870, "task_loss": 0.36099082231521606 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05660399070165367, "compression/movement_sparsity/importance_threshold": -0.11436605508337472, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15439045429229736, "epoch": 6.46, "learning_rate": 1.971171281076193e-06, "loss": 0.1787, "step": 17880, "task_loss": 0.5687848329544067 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056629690130678687, "compression/movement_sparsity/importance_threshold": -0.11401599087893588, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16982746124267578, "epoch": 6.47, "learning_rate": 1.9707184514051093e-06, "loss": 0.1699, "step": 17890, "task_loss": 0.5424094200134277 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05665533706369701, "compression/movement_sparsity/importance_threshold": -0.11366664174769192, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1268695890903473, "epoch": 6.47, "learning_rate": 1.970262145795545e-06, "loss": 0.1744, "step": 17900, "task_loss": 0.22294151782989502 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056680931554380046, "compression/movement_sparsity/importance_threshold": -0.11331800695855909, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15782135725021362, "epoch": 6.47, "learning_rate": 1.969802365881433e-06, "loss": 0.1716, "step": 17910, "task_loss": 0.6972410678863525 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0567064736563992, "compression/movement_sparsity/importance_threshold": -0.11297008578045331, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1635473221540451, "epoch": 6.48, "learning_rate": 1.969339113309149e-06, "loss": 0.1581, "step": 17920, "task_loss": 0.24740125238895416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05673196342342589, "compression/movement_sparsity/importance_threshold": -0.1126228774822906, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16152726113796234, "epoch": 6.48, "learning_rate": 1.9688723897375036e-06, "loss": 0.1625, "step": 17930, "task_loss": 0.4657655954360962 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05675740090913152, "compression/movement_sparsity/importance_threshold": -0.11227638133298723, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.24781015515327454, "epoch": 6.48, "learning_rate": 1.968402196837735e-06, "loss": 0.1717, "step": 17940, "task_loss": 0.33787935972213745 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05678278616718751, "compression/movement_sparsity/importance_threshold": -0.11193059660145899, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15699215233325958, "epoch": 6.49, "learning_rate": 1.9679285362935054e-06, "loss": 0.1605, "step": 17950, "task_loss": 0.3983635902404785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056808119251265256, "compression/movement_sparsity/importance_threshold": -0.11158552255662224, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14720289409160614, "epoch": 6.49, "learning_rate": 1.967451409800893e-06, "loss": 0.1625, "step": 17960, "task_loss": 0.5441750288009644 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056833400215036176, "compression/movement_sparsity/importance_threshold": -0.11124115846739302, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10097290575504303, "epoch": 6.49, "learning_rate": 1.966970819068388e-06, "loss": 0.168, "step": 17970, "task_loss": 0.5651923418045044 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05685862911217168, "compression/movement_sparsity/importance_threshold": -0.11089750360268713, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12678371369838715, "epoch": 6.5, "learning_rate": 1.966486765816884e-06, "loss": 0.1623, "step": 17980, "task_loss": 0.2238824963569641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05688380599634319, "compression/movement_sparsity/importance_threshold": -0.11055455723142082, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1631462424993515, "epoch": 6.5, "learning_rate": 1.9659992517796746e-06, "loss": 0.1727, "step": 17990, "task_loss": 0.4262044131755829 }, { "compression/movement_sparsity/importance_regularization_factor": 0.056908930921222095, "compression/movement_sparsity/importance_threshold": -0.11021231862251013, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15143990516662598, "epoch": 6.51, "learning_rate": 1.965508278702444e-06, "loss": 0.1656, "step": 18000, "task_loss": 0.20069824159145355 }, { "epoch": 6.51, "eval_exact_match": 83.56669820245979, "eval_f1": 89.97247850295564, "step": 18000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05693400394047982, "compression/movement_sparsity/importance_threshold": -0.10987078704487119, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16191574931144714, "epoch": 6.51, "learning_rate": 1.9650138483432644e-06, "loss": 0.1679, "step": 18010, "task_loss": 0.2866957187652588 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05695902510778778, "compression/movement_sparsity/importance_threshold": -0.10952996176741991, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17104315757751465, "epoch": 6.51, "learning_rate": 1.964515962472586e-06, "loss": 0.1615, "step": 18020, "task_loss": 0.278532475233078 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05698399447681737, "compression/movement_sparsity/importance_threshold": -0.10918984205907256, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17607031762599945, "epoch": 6.52, "learning_rate": 1.9640146228732343e-06, "loss": 0.1609, "step": 18030, "task_loss": 0.624789834022522 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05700891210124001, "compression/movement_sparsity/importance_threshold": -0.10885042718874505, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18854406476020813, "epoch": 6.52, "learning_rate": 1.9635098313404e-06, "loss": 0.1678, "step": 18040, "task_loss": 0.5639091730117798 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05703377803472711, "compression/movement_sparsity/importance_threshold": -0.10851171642535351, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18219125270843506, "epoch": 6.52, "learning_rate": 1.963001589681636e-06, "loss": 0.1618, "step": 18050, "task_loss": 0.41501083970069885 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057058592330950085, "compression/movement_sparsity/importance_threshold": -0.10817370903781398, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1165059357881546, "epoch": 6.53, "learning_rate": 1.962541223834976e-06, "loss": 0.1657, "step": 18060, "task_loss": 0.17311452329158783 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05708335504358033, "compression/movement_sparsity/importance_threshold": -0.10783640429504271, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16131412982940674, "epoch": 6.53, "learning_rate": 1.9620264319609926e-06, "loss": 0.1773, "step": 18070, "task_loss": 0.451940655708313 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05710806622628928, "compression/movement_sparsity/importance_threshold": -0.10749980146595539, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15116122364997864, "epoch": 6.53, "learning_rate": 1.96150819527282e-06, "loss": 0.1819, "step": 18080, "task_loss": 0.49812251329421997 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05713272593274832, "compression/movement_sparsity/importance_threshold": -0.10716389981946839, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1451411247253418, "epoch": 6.54, "learning_rate": 1.960986515626155e-06, "loss": 0.1655, "step": 18090, "task_loss": 0.2867373824119568 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05715733421662887, "compression/movement_sparsity/importance_threshold": -0.10682869862449773, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14213800430297852, "epoch": 6.54, "learning_rate": 1.960461394889023e-06, "loss": 0.1562, "step": 18100, "task_loss": 0.3731827139854431 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05718189113160235, "compression/movement_sparsity/importance_threshold": -0.10649419714995934, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1480969786643982, "epoch": 6.54, "learning_rate": 1.95993283494177e-06, "loss": 0.1588, "step": 18110, "task_loss": 0.45702794194221497 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05720639673134016, "compression/movement_sparsity/importance_threshold": -0.10616039466476945, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16650226712226868, "epoch": 6.55, "learning_rate": 1.9594008376770582e-06, "loss": 0.1623, "step": 18120, "task_loss": 0.3911457359790802 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05723085106951372, "compression/movement_sparsity/importance_threshold": -0.105827290437844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1613444685935974, "epoch": 6.55, "learning_rate": 1.9588654049998583e-06, "loss": 0.1854, "step": 18130, "task_loss": 0.46520888805389404 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057255254199794436, "compression/movement_sparsity/importance_threshold": -0.10549488373809901, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15832461416721344, "epoch": 6.56, "learning_rate": 1.958326538827442e-06, "loss": 0.1664, "step": 18140, "task_loss": 0.6141856908798218 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0572796061758537, "compression/movement_sparsity/importance_threshold": -0.10516317383445084, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2309219241142273, "epoch": 6.56, "learning_rate": 1.9577842410893747e-06, "loss": 0.1814, "step": 18150, "task_loss": 0.7296000719070435 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05730390705136296, "compression/movement_sparsity/importance_threshold": -0.10483215999581541, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1386280059814453, "epoch": 6.56, "learning_rate": 1.9572385137275114e-06, "loss": 0.1704, "step": 18160, "task_loss": 0.33172696828842163 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05732815687999359, "compression/movement_sparsity/importance_threshold": -0.10450184149110864, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16742317378520966, "epoch": 6.57, "learning_rate": 1.9566893586959866e-06, "loss": 0.1627, "step": 18170, "task_loss": 0.7185513973236084 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05735235571541704, "compression/movement_sparsity/importance_threshold": -0.10417221758924655, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14879506826400757, "epoch": 6.57, "learning_rate": 1.956136777961209e-06, "loss": 0.1696, "step": 18180, "task_loss": 0.5774948596954346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057376503611304676, "compression/movement_sparsity/importance_threshold": -0.10384328755914563, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1316477358341217, "epoch": 6.57, "learning_rate": 1.955580773501854e-06, "loss": 0.1585, "step": 18190, "task_loss": 0.15516257286071777 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05740060062132793, "compression/movement_sparsity/importance_threshold": -0.10351505066972155, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13433092832565308, "epoch": 6.58, "learning_rate": 1.955021347308856e-06, "loss": 0.1766, "step": 18200, "task_loss": 0.3908587694168091 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05742464679915822, "compression/movement_sparsity/importance_threshold": -0.10318750618989048, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12410083413124084, "epoch": 6.58, "learning_rate": 1.954458501385403e-06, "loss": 0.1743, "step": 18210, "task_loss": 0.5834453105926514 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05744864219846695, "compression/movement_sparsity/importance_threshold": -0.10286065338856853, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14193576574325562, "epoch": 6.58, "learning_rate": 1.953892237746928e-06, "loss": 0.1661, "step": 18220, "task_loss": 0.4615253806114197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05747258687292553, "compression/movement_sparsity/importance_threshold": -0.10253449153467176, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17011338472366333, "epoch": 6.59, "learning_rate": 1.9533225584211015e-06, "loss": 0.1622, "step": 18230, "task_loss": 0.2667291462421417 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05749648087620537, "compression/movement_sparsity/importance_threshold": -0.10220901989711617, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.156767338514328, "epoch": 6.59, "learning_rate": 1.952749465447825e-06, "loss": 0.1639, "step": 18240, "task_loss": 0.37695831060409546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057520324261977875, "compression/movement_sparsity/importance_threshold": -0.10188423774481792, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13007321953773499, "epoch": 6.6, "learning_rate": 1.9521729608792247e-06, "loss": 0.1629, "step": 18250, "task_loss": 0.50368332862854 }, { "epoch": 6.6, "eval_exact_match": 83.60454115421003, "eval_f1": 89.95005100815796, "step": 18250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05754411708391447, "compression/movement_sparsity/importance_threshold": -0.10156014434669303, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1553541123867035, "epoch": 6.6, "learning_rate": 1.9515930467796414e-06, "loss": 0.1703, "step": 18260, "task_loss": 0.3957485556602478 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05756785939568656, "compression/movement_sparsity/importance_threshold": -0.10123673897165752, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18168096244335175, "epoch": 6.6, "learning_rate": 1.9510097252256255e-06, "loss": 0.1604, "step": 18270, "task_loss": 0.38172927498817444 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057591551250965545, "compression/movement_sparsity/importance_threshold": -0.10091402088862766, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17573542892932892, "epoch": 6.61, "learning_rate": 1.9504229983059294e-06, "loss": 0.1685, "step": 18280, "task_loss": 0.23665164411067963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05761519270342284, "compression/movement_sparsity/importance_threshold": -0.10059198936651925, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14210020005702972, "epoch": 6.61, "learning_rate": 1.949832868121498e-06, "loss": 0.168, "step": 18290, "task_loss": 0.3023013472557068 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05763878380672986, "compression/movement_sparsity/importance_threshold": -0.10027064367424865, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1647496223449707, "epoch": 6.61, "learning_rate": 1.9492393367854633e-06, "loss": 0.1695, "step": 18300, "task_loss": 0.5862131118774414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05766232461455803, "compression/movement_sparsity/importance_threshold": -0.09994998308073144, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18408089876174927, "epoch": 6.62, "learning_rate": 1.9486424064231367e-06, "loss": 0.1789, "step": 18310, "task_loss": 0.5712409615516663 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05768581518057873, "compression/movement_sparsity/importance_threshold": -0.09963000685488421, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16597400605678558, "epoch": 6.62, "learning_rate": 1.9480420791719995e-06, "loss": 0.172, "step": 18320, "task_loss": 0.297050803899765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05770925555846339, "compression/movement_sparsity/importance_threshold": -0.09931071426562277, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15080735087394714, "epoch": 6.62, "learning_rate": 1.9474383571816978e-06, "loss": 0.1865, "step": 18330, "task_loss": 0.6137457489967346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05773264580188341, "compression/movement_sparsity/importance_threshold": -0.09899210458186325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13079148530960083, "epoch": 6.63, "learning_rate": 1.9468312426140326e-06, "loss": 0.1616, "step": 18340, "task_loss": 0.3845100402832031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057755985964510224, "compression/movement_sparsity/importance_threshold": -0.09867417707252157, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19262845814228058, "epoch": 6.63, "learning_rate": 1.9462207376429537e-06, "loss": 0.1708, "step": 18350, "task_loss": 0.3163360357284546 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05777927610001521, "compression/movement_sparsity/importance_threshold": -0.0983569310065141, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15635263919830322, "epoch": 6.64, "learning_rate": 1.9456068444545504e-06, "loss": 0.1548, "step": 18360, "task_loss": 0.3761829733848572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0578025162620698, "compression/movement_sparsity/importance_threshold": -0.09804036565275664, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14289672672748566, "epoch": 6.64, "learning_rate": 1.944989565247046e-06, "loss": 0.167, "step": 18370, "task_loss": 0.3219255805015564 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057825706504345396, "compression/movement_sparsity/importance_threshold": -0.09772448028016545, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1459888368844986, "epoch": 6.64, "learning_rate": 1.944368902230786e-06, "loss": 0.1658, "step": 18380, "task_loss": 0.33508729934692383 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057848846880513415, "compression/movement_sparsity/importance_threshold": -0.09740927415765632, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15043103694915771, "epoch": 6.65, "learning_rate": 1.943744857628235e-06, "loss": 0.1693, "step": 18390, "task_loss": 0.3376499116420746 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057871937444245264, "compression/movement_sparsity/importance_threshold": -0.09709474655414563, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1579783409833908, "epoch": 6.65, "learning_rate": 1.9431174336739656e-06, "loss": 0.1664, "step": 18400, "task_loss": 0.4792447090148926 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05789497824921235, "compression/movement_sparsity/importance_threshold": -0.09678089673854928, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12249864637851715, "epoch": 6.65, "learning_rate": 1.9424866326146506e-06, "loss": 0.1631, "step": 18410, "task_loss": 0.4236283302307129 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05791796934908609, "compression/movement_sparsity/importance_threshold": -0.09646772397978332, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15789994597434998, "epoch": 6.66, "learning_rate": 1.941852456709056e-06, "loss": 0.166, "step": 18420, "task_loss": 0.3791453540325165 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057940910797537895, "compression/movement_sparsity/importance_threshold": -0.09615522754676387, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11831511557102203, "epoch": 6.66, "learning_rate": 1.9412149082280315e-06, "loss": 0.1675, "step": 18430, "task_loss": 0.28735044598579407 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057963802648239166, "compression/movement_sparsity/importance_threshold": -0.09584340670840708, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17035847902297974, "epoch": 6.66, "learning_rate": 1.9405739894545044e-06, "loss": 0.1814, "step": 18440, "task_loss": 0.3237995505332947 }, { "compression/movement_sparsity/importance_regularization_factor": 0.057986644954861326, "compression/movement_sparsity/importance_threshold": -0.09553226073362886, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14559730887413025, "epoch": 6.67, "learning_rate": 1.9399297026834707e-06, "loss": 0.1559, "step": 18450, "task_loss": 0.3083050549030304 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05800943777107577, "compression/movement_sparsity/importance_threshold": -0.09522178889134547, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1738000512123108, "epoch": 6.67, "learning_rate": 1.939282050221985e-06, "loss": 0.1549, "step": 18460, "task_loss": 0.7337498664855957 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058032181150553926, "compression/movement_sparsity/importance_threshold": -0.0949119904504726, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13351115584373474, "epoch": 6.68, "learning_rate": 1.9386310343891546e-06, "loss": 0.1679, "step": 18470, "task_loss": 0.33444127440452576 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05805487514696719, "compression/movement_sparsity/importance_threshold": -0.09460286467992685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15732640027999878, "epoch": 6.68, "learning_rate": 1.9379766575161305e-06, "loss": 0.1589, "step": 18480, "task_loss": 0.35811203718185425 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05807751981398699, "compression/movement_sparsity/importance_threshold": -0.09429441084862367, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14750367403030396, "epoch": 6.68, "learning_rate": 1.937318921946098e-06, "loss": 0.1596, "step": 18490, "task_loss": 0.21471939980983734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05810011520528472, "compression/movement_sparsity/importance_threshold": -0.09398662822547976, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16651684045791626, "epoch": 6.69, "learning_rate": 1.93665783003427e-06, "loss": 0.167, "step": 18500, "task_loss": 0.6148971319198608 }, { "epoch": 6.69, "eval_exact_match": 83.69914853358561, "eval_f1": 90.03513064764907, "step": 18500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058122661374531794, "compression/movement_sparsity/importance_threshold": -0.09367951607941072, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16546697914600372, "epoch": 6.69, "learning_rate": 1.935993384147878e-06, "loss": 0.1774, "step": 18510, "task_loss": 0.5665580630302429 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05814515837539962, "compression/movement_sparsity/importance_threshold": -0.0933730736793329, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14349332451820374, "epoch": 6.69, "learning_rate": 1.9353255866661615e-06, "loss": 0.1656, "step": 18520, "task_loss": 0.6033084988594055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058167606261559625, "compression/movement_sparsity/importance_threshold": -0.09306730029416221, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14493785798549652, "epoch": 6.7, "learning_rate": 1.9346544399803647e-06, "loss": 0.1629, "step": 18530, "task_loss": 0.3323573172092438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05819000508668321, "compression/movement_sparsity/importance_threshold": -0.0927621951928147, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12968416512012482, "epoch": 6.7, "learning_rate": 1.933979946493721e-06, "loss": 0.16, "step": 18540, "task_loss": 0.27857789397239685 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05821235490444178, "compression/movement_sparsity/importance_threshold": -0.0924577576442066, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14049646258354187, "epoch": 6.7, "learning_rate": 1.93330210862145e-06, "loss": 0.1607, "step": 18550, "task_loss": 0.4084530472755432 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05823465576850675, "compression/movement_sparsity/importance_threshold": -0.09215398691725374, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15300214290618896, "epoch": 6.71, "learning_rate": 1.9326209287907472e-06, "loss": 0.1835, "step": 18560, "task_loss": 0.3902439773082733 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05825690773254952, "compression/movement_sparsity/importance_threshold": -0.09185088228087246, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1685321033000946, "epoch": 6.71, "learning_rate": 1.9319364094407734e-06, "loss": 0.1601, "step": 18570, "task_loss": 0.27366265654563904 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05827911085024152, "compression/movement_sparsity/importance_threshold": -0.09154844300397869, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18406498432159424, "epoch": 6.71, "learning_rate": 1.931248553022649e-06, "loss": 0.1699, "step": 18580, "task_loss": 0.5142942070960999 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05830126517525415, "compression/movement_sparsity/importance_threshold": -0.09124666835548845, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15226216614246368, "epoch": 6.72, "learning_rate": 1.9305573619994426e-06, "loss": 0.1693, "step": 18590, "task_loss": 0.43030428886413574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058323370761258826, "compression/movement_sparsity/importance_threshold": -0.09094555760431777, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14825284481048584, "epoch": 6.72, "learning_rate": 1.929862838846164e-06, "loss": 0.1628, "step": 18600, "task_loss": 0.4481026530265808 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05834542766192694, "compression/movement_sparsity/importance_threshold": -0.09064511001938291, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1773342490196228, "epoch": 6.73, "learning_rate": 1.929164986049754e-06, "loss": 0.1774, "step": 18610, "task_loss": 0.4238778352737427 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05836743593092993, "compression/movement_sparsity/importance_threshold": -0.09034532486959979, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16140668094158173, "epoch": 6.73, "learning_rate": 1.928463806109077e-06, "loss": 0.1705, "step": 18620, "task_loss": 0.650934100151062 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05838939562193919, "compression/movement_sparsity/importance_threshold": -0.09004620142388442, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17616376280784607, "epoch": 6.73, "learning_rate": 1.9277593015349107e-06, "loss": 0.1894, "step": 18630, "task_loss": 0.4096485376358032 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05841130678862614, "compression/movement_sparsity/importance_threshold": -0.08974773895115307, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1400645673274994, "epoch": 6.74, "learning_rate": 1.927051474849938e-06, "loss": 0.1758, "step": 18640, "task_loss": 0.45845314860343933 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05843316948466218, "compression/movement_sparsity/importance_threshold": -0.08944993672032164, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16155488789081573, "epoch": 6.74, "learning_rate": 1.926340328588737e-06, "loss": 0.1759, "step": 18650, "task_loss": 0.44465121626853943 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05845498376371872, "compression/movement_sparsity/importance_threshold": -0.08915279400030629, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.21582606434822083, "epoch": 6.74, "learning_rate": 1.9256258652977727e-06, "loss": 0.1758, "step": 18660, "task_loss": 0.5472663640975952 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05847674967946718, "compression/movement_sparsity/importance_threshold": -0.08885631006002304, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13278156518936157, "epoch": 6.75, "learning_rate": 1.924908087535388e-06, "loss": 0.1683, "step": 18670, "task_loss": 0.3198769688606262 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05849846728557897, "compression/movement_sparsity/importance_threshold": -0.08856048416838802, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16864389181137085, "epoch": 6.75, "learning_rate": 1.924186997871794e-06, "loss": 0.1601, "step": 18680, "task_loss": 0.4776211082935333 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0585201366357255, "compression/movement_sparsity/importance_threshold": -0.08826531559431705, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13846157491207123, "epoch": 6.75, "learning_rate": 1.9234625988890605e-06, "loss": 0.1714, "step": 18690, "task_loss": 0.5768707990646362 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05854175778357817, "compression/movement_sparsity/importance_threshold": -0.0879708036067266, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18022726476192474, "epoch": 6.76, "learning_rate": 1.9227348931811093e-06, "loss": 0.165, "step": 18700, "task_loss": 0.46484941244125366 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0585633307828084, "compression/movement_sparsity/importance_threshold": -0.08767694747453247, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1907450407743454, "epoch": 6.76, "learning_rate": 1.922003883353699e-06, "loss": 0.1671, "step": 18710, "task_loss": 0.5632059574127197 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05858485568708759, "compression/movement_sparsity/importance_threshold": -0.08738374646665081, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15245582163333893, "epoch": 6.77, "learning_rate": 1.9212695720244245e-06, "loss": 0.1807, "step": 18720, "task_loss": 0.5414676070213318 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058606332550087174, "compression/movement_sparsity/importance_threshold": -0.08709119985199754, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1406639814376831, "epoch": 6.77, "learning_rate": 1.9205319618226984e-06, "loss": 0.1676, "step": 18730, "task_loss": 0.3787824213504791 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05862776142547855, "compression/movement_sparsity/importance_threshold": -0.08679930689948889, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1576058268547058, "epoch": 6.77, "learning_rate": 1.919791055389748e-06, "loss": 0.1666, "step": 18740, "task_loss": 0.5415380001068115 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05864914236693311, "compression/movement_sparsity/importance_threshold": -0.08650806687804102, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1747387945652008, "epoch": 6.78, "learning_rate": 1.919046855378604e-06, "loss": 0.1728, "step": 18750, "task_loss": 0.3638462424278259 }, { "epoch": 6.78, "eval_exact_match": 83.69914853358561, "eval_f1": 90.11730656848496, "step": 18750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058670475428122294, "compression/movement_sparsity/importance_threshold": -0.08621747905656973, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14378315210342407, "epoch": 6.78, "learning_rate": 1.918299364454089e-06, "loss": 0.1612, "step": 18760, "task_loss": 0.7437037825584412 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058691760662717495, "compression/movement_sparsity/importance_threshold": -0.08592754270399128, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16446062922477722, "epoch": 6.78, "learning_rate": 1.917548585292811e-06, "loss": 0.1757, "step": 18770, "task_loss": 0.5021160840988159 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05871299812439014, "compression/movement_sparsity/importance_threshold": -0.08563825708922157, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14091575145721436, "epoch": 6.79, "learning_rate": 1.9167945205831526e-06, "loss": 0.177, "step": 18780, "task_loss": 0.618760883808136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05873418786681161, "compression/movement_sparsity/importance_threshold": -0.08534962148117686, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13678541779518127, "epoch": 6.79, "learning_rate": 1.9160371730252607e-06, "loss": 0.1767, "step": 18790, "task_loss": 0.43866440653800964 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058755329943653345, "compression/movement_sparsity/importance_threshold": -0.08506163514877318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17583860456943512, "epoch": 6.79, "learning_rate": 1.9152765453310366e-06, "loss": 0.1779, "step": 18800, "task_loss": 0.7005617618560791 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05877642440858674, "compression/movement_sparsity/importance_threshold": -0.08477429736092645, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14417672157287598, "epoch": 6.8, "learning_rate": 1.9145126402241293e-06, "loss": 0.1687, "step": 18810, "task_loss": 0.5040542483329773 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05879747131528321, "compression/movement_sparsity/importance_threshold": -0.08448760738655292, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17511731386184692, "epoch": 6.8, "learning_rate": 1.9137454604399215e-06, "loss": 0.1786, "step": 18820, "task_loss": 0.6716142892837524 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05881847071741417, "compression/movement_sparsity/importance_threshold": -0.0842015644945685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16290408372879028, "epoch": 6.81, "learning_rate": 1.9129750087255232e-06, "loss": 0.1747, "step": 18830, "task_loss": 0.4927728772163391 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05883942266865102, "compression/movement_sparsity/importance_threshold": -0.08391616795388934, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16758540272712708, "epoch": 6.81, "learning_rate": 1.9122012878397593e-06, "loss": 0.1568, "step": 18840, "task_loss": 0.3878645896911621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058860327222665175, "compression/movement_sparsity/importance_threshold": -0.08363141703343147, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14906227588653564, "epoch": 6.81, "learning_rate": 1.911424300553161e-06, "loss": 0.1536, "step": 18850, "task_loss": 0.20304948091506958 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05888118443312805, "compression/movement_sparsity/importance_threshold": -0.08334731100211101, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1479981690645218, "epoch": 6.82, "learning_rate": 1.9106440496479573e-06, "loss": 0.1643, "step": 18860, "task_loss": 0.6089077591896057 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05890199435371106, "compression/movement_sparsity/importance_threshold": -0.08306384912884401, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.143125981092453, "epoch": 6.82, "learning_rate": 1.9098605379180613e-06, "loss": 0.1653, "step": 18870, "task_loss": 0.32572343945503235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058922757038085596, "compression/movement_sparsity/importance_threshold": -0.08278103068254661, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13448187708854675, "epoch": 6.82, "learning_rate": 1.909073768169065e-06, "loss": 0.1586, "step": 18880, "task_loss": 0.4831852912902832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058943472539923096, "compression/movement_sparsity/importance_threshold": -0.0824988549321346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14394140243530273, "epoch": 6.83, "learning_rate": 1.908283743218224e-06, "loss": 0.1585, "step": 18890, "task_loss": 0.41934144496917725 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05896414091289494, "compression/movement_sparsity/importance_threshold": -0.08221732114652447, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14577503502368927, "epoch": 6.83, "learning_rate": 1.9074904658944524e-06, "loss": 0.1639, "step": 18900, "task_loss": 0.29828184843063354 }, { "compression/movement_sparsity/importance_regularization_factor": 0.058984762210672566, "compression/movement_sparsity/importance_threshold": -0.0819364285946318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1624283492565155, "epoch": 6.83, "learning_rate": 1.9066939390383086e-06, "loss": 0.1667, "step": 18910, "task_loss": 0.37106114625930786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059005336486927365, "compression/movement_sparsity/importance_threshold": -0.08165617654537305, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1468050479888916, "epoch": 6.84, "learning_rate": 1.905894165501988e-06, "loss": 0.163, "step": 18920, "task_loss": 0.7496539354324341 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05902586379533076, "compression/movement_sparsity/importance_threshold": -0.08137656426766415, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15505670011043549, "epoch": 6.84, "learning_rate": 1.9050911481493112e-06, "loss": 0.163, "step": 18930, "task_loss": 0.45304474234580994 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05904634418955415, "compression/movement_sparsity/importance_threshold": -0.08109759103042113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17738257348537445, "epoch": 6.84, "learning_rate": 1.9042848898557145e-06, "loss": 0.1714, "step": 18940, "task_loss": 0.4849020838737488 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05906677772326896, "compression/movement_sparsity/importance_threshold": -0.08081925610256013, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12414862960577011, "epoch": 6.85, "learning_rate": 1.903475393508239e-06, "loss": 0.1536, "step": 18950, "task_loss": 0.5212274789810181 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05908716445014659, "compression/movement_sparsity/importance_threshold": -0.08054155875299718, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16516438126564026, "epoch": 6.85, "learning_rate": 1.9026626620055208e-06, "loss": 0.1734, "step": 18960, "task_loss": 0.3675074577331543 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059107504423858453, "compression/movement_sparsity/importance_threshold": -0.0802644982506483, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13828526437282562, "epoch": 6.86, "learning_rate": 1.9018466982577802e-06, "loss": 0.1617, "step": 18970, "task_loss": 0.44366180896759033 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05912779769807596, "compression/movement_sparsity/importance_threshold": -0.07998807386442963, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1504247784614563, "epoch": 6.86, "learning_rate": 1.9010275051868123e-06, "loss": 0.1575, "step": 18980, "task_loss": 0.5267570614814758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05914804432647053, "compression/movement_sparsity/importance_threshold": -0.07971228486325721, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1422831118106842, "epoch": 6.86, "learning_rate": 1.9002050857259743e-06, "loss": 0.1798, "step": 18990, "task_loss": 0.38829371333122253 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059168244362713555, "compression/movement_sparsity/importance_threshold": -0.07943713051604717, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14333003759384155, "epoch": 6.87, "learning_rate": 1.8993794428201775e-06, "loss": 0.1553, "step": 19000, "task_loss": 0.5657933950424194 }, { "epoch": 6.87, "eval_exact_match": 83.60454115421003, "eval_f1": 89.96767923298626, "step": 19000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059188397860476465, "compression/movement_sparsity/importance_threshold": -0.07916261009171532, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16756048798561096, "epoch": 6.87, "learning_rate": 1.8985505794258754e-06, "loss": 0.1683, "step": 19010, "task_loss": 0.4580082893371582 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059208504873430656, "compression/movement_sparsity/importance_threshold": -0.07888872285917803, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13767385482788086, "epoch": 6.87, "learning_rate": 1.8977184985110535e-06, "loss": 0.1644, "step": 19020, "task_loss": 0.33297887444496155 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059228565455247545, "compression/movement_sparsity/importance_threshold": -0.07861546808735143, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1662845015525818, "epoch": 6.88, "learning_rate": 1.8968832030552182e-06, "loss": 0.1624, "step": 19030, "task_loss": 0.7603532671928406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05924857965959853, "compression/movement_sparsity/importance_threshold": -0.07834284504515132, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14940595626831055, "epoch": 6.88, "learning_rate": 1.8960446960493872e-06, "loss": 0.168, "step": 19040, "task_loss": 0.357892781496048 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05926854754015506, "compression/movement_sparsity/importance_threshold": -0.07807085300149363, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1585623174905777, "epoch": 6.88, "learning_rate": 1.895202980496077e-06, "loss": 0.172, "step": 19050, "task_loss": 0.2738313674926758 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059288469150588496, "compression/movement_sparsity/importance_threshold": -0.07779949122529495, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14924949407577515, "epoch": 6.89, "learning_rate": 1.8943580594092942e-06, "loss": 0.1706, "step": 19060, "task_loss": 0.4149706959724426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059308344544570284, "compression/movement_sparsity/importance_threshold": -0.07752875898547096, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12119010835886002, "epoch": 6.89, "learning_rate": 1.8935099358145233e-06, "loss": 0.1463, "step": 19070, "task_loss": 0.31337571144104004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05932817377577181, "compression/movement_sparsity/importance_threshold": -0.0772586555509378, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13046619296073914, "epoch": 6.9, "learning_rate": 1.8926586127487165e-06, "loss": 0.1732, "step": 19080, "task_loss": 0.577675461769104 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05934795689786451, "compression/movement_sparsity/importance_threshold": -0.07698918019061152, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13301792740821838, "epoch": 6.9, "learning_rate": 1.8918040932602822e-06, "loss": 0.1594, "step": 19090, "task_loss": 0.38307079672813416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059367693964519776, "compression/movement_sparsity/importance_threshold": -0.07672033217340835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15103420615196228, "epoch": 6.9, "learning_rate": 1.8909463804090753e-06, "loss": 0.1592, "step": 19100, "task_loss": 0.5020593404769897 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059387385029409025, "compression/movement_sparsity/importance_threshold": -0.0764521107682441, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14066827297210693, "epoch": 6.91, "learning_rate": 1.890085477266385e-06, "loss": 0.1688, "step": 19110, "task_loss": 0.2139877825975418 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05940703014620367, "compression/movement_sparsity/importance_threshold": -0.07618451524403502, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1647796630859375, "epoch": 6.91, "learning_rate": 1.8892213869149238e-06, "loss": 0.1717, "step": 19120, "task_loss": 0.4196796417236328 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05942662936857511, "compression/movement_sparsity/importance_threshold": -0.07591754486969715, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17903751134872437, "epoch": 6.91, "learning_rate": 1.8883541124488178e-06, "loss": 0.1747, "step": 19130, "task_loss": 0.399991512298584 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059446182750194775, "compression/movement_sparsity/importance_threshold": -0.07565119891414651, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13912296295166016, "epoch": 6.92, "learning_rate": 1.8874836569735942e-06, "loss": 0.1705, "step": 19140, "task_loss": 0.6287709474563599 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05946569034473405, "compression/movement_sparsity/importance_threshold": -0.07538547664629924, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14840209484100342, "epoch": 6.92, "learning_rate": 1.8866100236061708e-06, "loss": 0.1693, "step": 19150, "task_loss": 0.45292603969573975 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05948515220586437, "compression/movement_sparsity/importance_threshold": -0.07512037733507126, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1409136801958084, "epoch": 6.92, "learning_rate": 1.885733215474845e-06, "loss": 0.1563, "step": 19160, "task_loss": 0.23241209983825684 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05950456838725712, "compression/movement_sparsity/importance_threshold": -0.07485590024937894, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2178533971309662, "epoch": 6.93, "learning_rate": 1.8848532357192824e-06, "loss": 0.1765, "step": 19170, "task_loss": 0.4487258791923523 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059523938942583746, "compression/movement_sparsity/importance_threshold": -0.07459204465813785, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13762247562408447, "epoch": 6.93, "learning_rate": 1.8839700874905046e-06, "loss": 0.169, "step": 19180, "task_loss": 0.3723566234111786 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059543263925515635, "compression/movement_sparsity/importance_threshold": -0.07432880983026457, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17009657621383667, "epoch": 6.94, "learning_rate": 1.8830837739508802e-06, "loss": 0.1631, "step": 19190, "task_loss": 0.23757542669773102 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05956254338972419, "compression/movement_sparsity/importance_threshold": -0.07406619503467493, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19928023219108582, "epoch": 6.94, "learning_rate": 1.8821942982741113e-06, "loss": 0.1832, "step": 19200, "task_loss": 0.39194488525390625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05958177738888084, "compression/movement_sparsity/importance_threshold": -0.07380419954028494, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13645857572555542, "epoch": 6.94, "learning_rate": 1.8813016636452228e-06, "loss": 0.155, "step": 19210, "task_loss": 0.3137187361717224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059600965976657, "compression/movement_sparsity/importance_threshold": -0.07354282261601064, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1318303644657135, "epoch": 6.95, "learning_rate": 1.8804058732605516e-06, "loss": 0.1635, "step": 19220, "task_loss": 0.3633446991443634 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05962010920672406, "compression/movement_sparsity/importance_threshold": -0.07328206353076827, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1735188364982605, "epoch": 6.95, "learning_rate": 1.879506930327735e-06, "loss": 0.1601, "step": 19230, "task_loss": 0.5617966651916504 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05963920713275343, "compression/movement_sparsity/importance_threshold": -0.07302192155347387, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1602572202682495, "epoch": 6.95, "learning_rate": 1.8786048380656979e-06, "loss": 0.1736, "step": 19240, "task_loss": 0.35640949010849 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05965825980841654, "compression/movement_sparsity/importance_threshold": -0.07276239595304346, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17654402554035187, "epoch": 6.96, "learning_rate": 1.8776995997046424e-06, "loss": 0.1577, "step": 19250, "task_loss": 0.3391542434692383 }, { "epoch": 6.96, "eval_exact_match": 83.50047303689688, "eval_f1": 89.92843125598556, "step": 19250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05967726728738478, "compression/movement_sparsity/importance_threshold": -0.07250348599839318, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12063628435134888, "epoch": 6.96, "learning_rate": 1.876791218486038e-06, "loss": 0.1641, "step": 19260, "task_loss": 0.405428946018219 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059696229623329586, "compression/movement_sparsity/importance_threshold": -0.07224519095843895, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.148264080286026, "epoch": 6.96, "learning_rate": 1.8758796976626056e-06, "loss": 0.1597, "step": 19270, "task_loss": 0.3504221439361572 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05971514686992235, "compression/movement_sparsity/importance_threshold": -0.07198751010209681, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18574409186840057, "epoch": 6.97, "learning_rate": 1.8749650404983096e-06, "loss": 0.1745, "step": 19280, "task_loss": 0.5480473041534424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05973401908083449, "compression/movement_sparsity/importance_threshold": -0.07173044269828299, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1708531677722931, "epoch": 6.97, "learning_rate": 1.8740472502683445e-06, "loss": 0.1658, "step": 19290, "task_loss": 0.329237699508667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05975284630973741, "compression/movement_sparsity/importance_threshold": -0.07147398801591365, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14285686612129211, "epoch": 6.98, "learning_rate": 1.8731263302591249e-06, "loss": 0.1752, "step": 19300, "task_loss": 0.46866947412490845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05977162861030252, "compression/movement_sparsity/importance_threshold": -0.07121814532390458, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.149366557598114, "epoch": 6.98, "learning_rate": 1.8722022837682707e-06, "loss": 0.1703, "step": 19310, "task_loss": 0.5933884382247925 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059790366036201234, "compression/movement_sparsity/importance_threshold": -0.07096291389117193, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16207867860794067, "epoch": 6.98, "learning_rate": 1.8712751141045983e-06, "loss": 0.1811, "step": 19320, "task_loss": 0.36161190271377563 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05980905864110496, "compression/movement_sparsity/importance_threshold": -0.07070829298663195, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17707979679107666, "epoch": 6.99, "learning_rate": 1.8703448245881071e-06, "loss": 0.1718, "step": 19330, "task_loss": 0.3630777597427368 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05982770647868512, "compression/movement_sparsity/importance_threshold": -0.07045428187920033, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14741402864456177, "epoch": 6.99, "learning_rate": 1.8694114185499679e-06, "loss": 0.172, "step": 19340, "task_loss": 0.31712833046913147 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05984630960261311, "compression/movement_sparsity/importance_threshold": -0.07020087983779355, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13535742461681366, "epoch": 6.99, "learning_rate": 1.8684748993325111e-06, "loss": 0.1702, "step": 19350, "task_loss": 0.33675703406333923 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05986486806656035, "compression/movement_sparsity/importance_threshold": -0.06994808613132752, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16717709600925446, "epoch": 7.0, "learning_rate": 1.8675352702892155e-06, "loss": 0.1711, "step": 19360, "task_loss": 0.6665349006652832 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05988338192419825, "compression/movement_sparsity/importance_threshold": -0.06969590002871806, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17918677628040314, "epoch": 7.0, "learning_rate": 1.866592534784695e-06, "loss": 0.1832, "step": 19370, "task_loss": 0.6708546876907349 }, { "compression/movement_sparsity/importance_regularization_factor": 0.059901851229198215, "compression/movement_sparsity/importance_threshold": -0.06944432079888163, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14213892817497253, "epoch": 7.0, "learning_rate": 1.8656466961946862e-06, "loss": 0.169, "step": 19380, "task_loss": 0.372379332780838 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05992027603523166, "compression/movement_sparsity/importance_threshold": -0.06919334771073415, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17070825397968292, "epoch": 7.01, "learning_rate": 1.8646977579060389e-06, "loss": 0.1609, "step": 19390, "task_loss": 0.3748677968978882 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05993865639596999, "compression/movement_sparsity/importance_threshold": -0.06894298003319155, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2126743048429489, "epoch": 7.01, "learning_rate": 1.8637457233167005e-06, "loss": 0.1668, "step": 19400, "task_loss": 0.5745400190353394 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05995699236508463, "compression/movement_sparsity/importance_threshold": -0.06869321703517006, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.130209818482399, "epoch": 7.01, "learning_rate": 1.8627905958357073e-06, "loss": 0.1665, "step": 19410, "task_loss": 0.34854692220687866 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05997528399624697, "compression/movement_sparsity/importance_threshold": -0.06844405798558584, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15571898221969604, "epoch": 7.02, "learning_rate": 1.8618323788831697e-06, "loss": 0.1792, "step": 19420, "task_loss": 0.6182477474212646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.05999353134312843, "compression/movement_sparsity/importance_threshold": -0.06819550215335468, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1204761266708374, "epoch": 7.02, "learning_rate": 1.8608710758902607e-06, "loss": 0.1644, "step": 19430, "task_loss": 0.415132999420166 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06001173445940043, "compression/movement_sparsity/importance_threshold": -0.06794754880739273, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18349982798099518, "epoch": 7.03, "learning_rate": 1.859906690299204e-06, "loss": 0.1842, "step": 19440, "task_loss": 0.5370660424232483 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06002989339873437, "compression/movement_sparsity/importance_threshold": -0.06770019721661613, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1359555423259735, "epoch": 7.03, "learning_rate": 1.8589392255632617e-06, "loss": 0.1724, "step": 19450, "task_loss": 0.41707509756088257 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060048008214801656, "compression/movement_sparsity/importance_threshold": -0.06745344664994113, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15165475010871887, "epoch": 7.03, "learning_rate": 1.857968685146721e-06, "loss": 0.1643, "step": 19460, "task_loss": 0.43051207065582275 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06006607896127371, "compression/movement_sparsity/importance_threshold": -0.06720729637628331, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1730663776397705, "epoch": 7.04, "learning_rate": 1.8569950725248831e-06, "loss": 0.1654, "step": 19470, "task_loss": 0.5514428615570068 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06008410569182194, "compression/movement_sparsity/importance_threshold": -0.06696174566455915, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15489190816879272, "epoch": 7.04, "learning_rate": 1.85601839118405e-06, "loss": 0.1711, "step": 19480, "task_loss": 0.3652356266975403 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06010208846011775, "compression/movement_sparsity/importance_threshold": -0.06671679378368456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12107902020215988, "epoch": 7.04, "learning_rate": 1.8550386446215121e-06, "loss": 0.1532, "step": 19490, "task_loss": 0.49454283714294434 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060120027319832556, "compression/movement_sparsity/importance_threshold": -0.06647244000257568, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1640836000442505, "epoch": 7.05, "learning_rate": 1.8540558363455353e-06, "loss": 0.1532, "step": 19500, "task_loss": 0.3784339427947998 }, { "epoch": 7.05, "eval_exact_match": 83.57615894039735, "eval_f1": 89.98257156583203, "step": 19500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06013792232463777, "compression/movement_sparsity/importance_threshold": -0.06622868359014844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1305304914712906, "epoch": 7.05, "learning_rate": 1.8530699698753494e-06, "loss": 0.1526, "step": 19510, "task_loss": 0.4855495095252991 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0601557735282048, "compression/movement_sparsity/importance_threshold": -0.06598552381531908, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.121201291680336, "epoch": 7.05, "learning_rate": 1.8520810487411347e-06, "loss": 0.1668, "step": 19520, "task_loss": 0.5691750049591064 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060173580984205055, "compression/movement_sparsity/importance_threshold": -0.06574295994700352, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14304503798484802, "epoch": 7.06, "learning_rate": 1.8510890764840098e-06, "loss": 0.1623, "step": 19530, "task_loss": 0.4884259104728699 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060191344746309945, "compression/movement_sparsity/importance_threshold": -0.06550099125411801, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1405264437198639, "epoch": 7.06, "learning_rate": 1.8500940566560187e-06, "loss": 0.1557, "step": 19540, "task_loss": 0.7580517530441284 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060209064868190894, "compression/movement_sparsity/importance_threshold": -0.06525961700557836, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14326691627502441, "epoch": 7.07, "learning_rate": 1.8490959928201173e-06, "loss": 0.1639, "step": 19550, "task_loss": 0.7660905122756958 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06022674140351929, "compression/movement_sparsity/importance_threshold": -0.06501883647030082, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13954105973243713, "epoch": 7.07, "learning_rate": 1.8480948885501627e-06, "loss": 0.1734, "step": 19560, "task_loss": 0.5161733627319336 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06024437440596656, "compression/movement_sparsity/importance_threshold": -0.06477864891720131, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1333414614200592, "epoch": 7.07, "learning_rate": 1.847090747430899e-06, "loss": 0.1631, "step": 19570, "task_loss": 0.24418434500694275 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0602619639292041, "compression/movement_sparsity/importance_threshold": -0.0645390536151963, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13161346316337585, "epoch": 7.08, "learning_rate": 1.8460835730579434e-06, "loss": 0.1733, "step": 19580, "task_loss": 0.5065572261810303 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06027951002690334, "compression/movement_sparsity/importance_threshold": -0.06430004983320126, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12809817492961884, "epoch": 7.08, "learning_rate": 1.8450733690377757e-06, "loss": 0.1826, "step": 19590, "task_loss": 0.36705049872398376 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06029701275273568, "compression/movement_sparsity/importance_threshold": -0.06406163684013266, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1483440101146698, "epoch": 7.08, "learning_rate": 1.8440601389877241e-06, "loss": 0.1648, "step": 19600, "task_loss": 0.2816522419452667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060314472160372536, "compression/movement_sparsity/importance_threshold": -0.06382381390490643, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16983579099178314, "epoch": 7.09, "learning_rate": 1.843043886535952e-06, "loss": 0.1687, "step": 19610, "task_loss": 0.5111091136932373 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060331888303485307, "compression/movement_sparsity/importance_threshold": -0.06358658029643882, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1196693629026413, "epoch": 7.09, "learning_rate": 1.8420246153214451e-06, "loss": 0.154, "step": 19620, "task_loss": 0.24485935270786285 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06034926123574542, "compression/movement_sparsity/importance_threshold": -0.06334993528364552, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11787790060043335, "epoch": 7.09, "learning_rate": 1.841002328994e-06, "loss": 0.1685, "step": 19630, "task_loss": 0.6096109747886658 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06036659101082427, "compression/movement_sparsity/importance_threshold": -0.06311387813544289, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1214839369058609, "epoch": 7.1, "learning_rate": 1.8399770312142082e-06, "loss": 0.1656, "step": 19640, "task_loss": 0.24309919774532318 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06038387768239327, "compression/movement_sparsity/importance_threshold": -0.06287840812074696, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12063048779964447, "epoch": 7.1, "learning_rate": 1.8389487256534456e-06, "loss": 0.163, "step": 19650, "task_loss": 0.29860949516296387 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06040112130412384, "compression/movement_sparsity/importance_threshold": -0.06264352450847377, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18652105331420898, "epoch": 7.11, "learning_rate": 1.8379174159938578e-06, "loss": 0.1688, "step": 19660, "task_loss": 0.5233631134033203 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06041832192968739, "compression/movement_sparsity/importance_threshold": -0.06240922656753922, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1551060527563095, "epoch": 7.11, "learning_rate": 1.8368831059283476e-06, "loss": 0.1645, "step": 19670, "task_loss": 0.529505729675293 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060435479612755305, "compression/movement_sparsity/importance_threshold": -0.0621755135668598, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15519507229328156, "epoch": 7.11, "learning_rate": 1.835845799160562e-06, "loss": 0.1601, "step": 19680, "task_loss": 0.6162427067756653 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06045259440699903, "compression/movement_sparsity/importance_threshold": -0.06194238477535119, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1276751011610031, "epoch": 7.12, "learning_rate": 1.8348054994048783e-06, "loss": 0.1812, "step": 19690, "task_loss": 0.36186686158180237 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06046966636608997, "compression/movement_sparsity/importance_threshold": -0.06170983946192954, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1895449459552765, "epoch": 7.12, "learning_rate": 1.8337622103863906e-06, "loss": 0.1648, "step": 19700, "task_loss": 0.6113171577453613 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06048669554369951, "compression/movement_sparsity/importance_threshold": -0.0614778768955111, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12751251459121704, "epoch": 7.12, "learning_rate": 1.832715935840897e-06, "loss": 0.1482, "step": 19710, "task_loss": 0.271990031003952 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06050368199349909, "compression/movement_sparsity/importance_threshold": -0.06124649634501167, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1760406345129013, "epoch": 7.13, "learning_rate": 1.8316666795148873e-06, "loss": 0.175, "step": 19720, "task_loss": 0.5232373476028442 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06052062576916011, "compression/movement_sparsity/importance_threshold": -0.0610156970793474, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1406281739473343, "epoch": 7.13, "learning_rate": 1.8306144451655273e-06, "loss": 0.1719, "step": 19730, "task_loss": 0.2717032730579376 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06053752692435397, "compression/movement_sparsity/importance_threshold": -0.06078547836743464, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14542102813720703, "epoch": 7.13, "learning_rate": 1.8295592365606462e-06, "loss": 0.1952, "step": 19740, "task_loss": 0.4864187240600586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0605543855127521, "compression/movement_sparsity/importance_threshold": -0.06055583947818899, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15917694568634033, "epoch": 7.14, "learning_rate": 1.8285010574787249e-06, "loss": 0.1618, "step": 19750, "task_loss": 0.5574923753738403 }, { "epoch": 7.14, "eval_exact_match": 83.54777672658467, "eval_f1": 90.08027567236861, "step": 19750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0605712015880259, "compression/movement_sparsity/importance_threshold": -0.06032677968052669, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19061726331710815, "epoch": 7.14, "learning_rate": 1.827439911708879e-06, "loss": 0.1815, "step": 19760, "task_loss": 0.5931074023246765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060587975203846775, "compression/movement_sparsity/importance_threshold": -0.06009829824336399, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14040836691856384, "epoch": 7.14, "learning_rate": 1.8263758030508489e-06, "loss": 0.173, "step": 19770, "task_loss": 0.8596678972244263 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06060470641388614, "compression/movement_sparsity/importance_threshold": -0.059870394435616814, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1635446846485138, "epoch": 7.15, "learning_rate": 1.8253087353149833e-06, "loss": 0.1812, "step": 19780, "task_loss": 0.29845842719078064 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06062139527181541, "compression/movement_sparsity/importance_threshold": -0.05964306752620119, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1550963819026947, "epoch": 7.15, "learning_rate": 1.8242387123222275e-06, "loss": 0.1731, "step": 19790, "task_loss": 0.542272686958313 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060638041831306, "compression/movement_sparsity/importance_threshold": -0.05941631678403325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12241744995117188, "epoch": 7.16, "learning_rate": 1.8231657379041089e-06, "loss": 0.1505, "step": 19800, "task_loss": 0.41791096329689026 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060654646146029306, "compression/movement_sparsity/importance_threshold": -0.059190141478029146, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12771350145339966, "epoch": 7.16, "learning_rate": 1.8220898159027223e-06, "loss": 0.1572, "step": 19810, "task_loss": 0.4617471396923065 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060671208269656746, "compression/movement_sparsity/importance_threshold": -0.05896454087710479, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1922406703233719, "epoch": 7.16, "learning_rate": 1.8210109501707184e-06, "loss": 0.1773, "step": 19820, "task_loss": 0.39758336544036865 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06068772825585973, "compression/movement_sparsity/importance_threshold": -0.058739514250176206, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1719903200864792, "epoch": 7.17, "learning_rate": 1.8199291445712883e-06, "loss": 0.1861, "step": 19830, "task_loss": 0.5382665395736694 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060704206158309675, "compression/movement_sparsity/importance_threshold": -0.05851506086615965, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14048738777637482, "epoch": 7.17, "learning_rate": 1.8188444029781502e-06, "loss": 0.1595, "step": 19840, "task_loss": 0.169685959815979 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06072064203067798, "compression/movement_sparsity/importance_threshold": -0.05829117999397104, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1779119223356247, "epoch": 7.17, "learning_rate": 1.8177567292755352e-06, "loss": 0.167, "step": 19850, "task_loss": 0.4589795470237732 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06073703592663606, "compression/movement_sparsity/importance_threshold": -0.05806787090252663, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15580281615257263, "epoch": 7.18, "learning_rate": 1.8166661273581744e-06, "loss": 0.1726, "step": 19860, "task_loss": 0.2717253267765045 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06075338789985533, "compression/movement_sparsity/importance_threshold": -0.05784513286074233, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17201679944992065, "epoch": 7.18, "learning_rate": 1.8155726011312838e-06, "loss": 0.1703, "step": 19870, "task_loss": 0.46242088079452515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060769698004007194, "compression/movement_sparsity/importance_threshold": -0.05762296513753418, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14266745746135712, "epoch": 7.18, "learning_rate": 1.8144761545105498e-06, "loss": 0.16, "step": 19880, "task_loss": 0.3505728542804718 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060785966292763075, "compression/movement_sparsity/importance_threshold": -0.05740136700181819, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19506075978279114, "epoch": 7.19, "learning_rate": 1.8133767914221179e-06, "loss": 0.159, "step": 19890, "task_loss": 0.731766939163208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06080219281979437, "compression/movement_sparsity/importance_threshold": -0.05718033772251074, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14748713374137878, "epoch": 7.19, "learning_rate": 1.8122745158025756e-06, "loss": 0.1759, "step": 19900, "task_loss": 0.5381277203559875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060818377638772486, "compression/movement_sparsity/importance_threshold": -0.056959876568527634, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1392291784286499, "epoch": 7.2, "learning_rate": 1.81116933159894e-06, "loss": 0.1629, "step": 19910, "task_loss": 0.3705242872238159 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060834520803368854, "compression/movement_sparsity/importance_threshold": -0.056739982808785006, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13060082495212555, "epoch": 7.2, "learning_rate": 1.810061242768643e-06, "loss": 0.1549, "step": 19920, "task_loss": 0.4211186468601227 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06085062236725487, "compression/movement_sparsity/importance_threshold": -0.056520655712198775, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12718503177165985, "epoch": 7.2, "learning_rate": 1.8089502532795175e-06, "loss": 0.1536, "step": 19930, "task_loss": 0.36818772554397583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060866682384101936, "compression/movement_sparsity/importance_threshold": -0.056301894547685305, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14205113053321838, "epoch": 7.21, "learning_rate": 1.807836367109783e-06, "loss": 0.1711, "step": 19940, "task_loss": 0.579256534576416 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06088270090758148, "compression/movement_sparsity/importance_threshold": -0.05608369858416051, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1530269980430603, "epoch": 7.21, "learning_rate": 1.8067195882480321e-06, "loss": 0.1737, "step": 19950, "task_loss": 0.43539872765541077 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06089867799136491, "compression/movement_sparsity/importance_threshold": -0.05586606709054043, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18445491790771484, "epoch": 7.21, "learning_rate": 1.805599920693214e-06, "loss": 0.1709, "step": 19960, "task_loss": 0.651823103427887 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06091461368912364, "compression/movement_sparsity/importance_threshold": -0.05564899933574119, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12383359670639038, "epoch": 7.22, "learning_rate": 1.8044773684546228e-06, "loss": 0.1732, "step": 19970, "task_loss": 0.5851423740386963 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06093050805452906, "compression/movement_sparsity/importance_threshold": -0.055432494588678716, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1711319088935852, "epoch": 7.22, "learning_rate": 1.8033519355518822e-06, "loss": 0.1678, "step": 19980, "task_loss": 0.5596441030502319 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060946361141252604, "compression/movement_sparsity/importance_threshold": -0.05521655211826926, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15698492527008057, "epoch": 7.22, "learning_rate": 1.8022236260149303e-06, "loss": 0.1647, "step": 19990, "task_loss": 0.33208391070365906 }, { "compression/movement_sparsity/importance_regularization_factor": 0.060962173002965664, "compression/movement_sparsity/importance_threshold": -0.055001171193428844, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15464134514331818, "epoch": 7.23, "learning_rate": 1.8010924438840057e-06, "loss": 0.1562, "step": 20000, "task_loss": 0.3163996636867523 }, { "epoch": 7.23, "eval_exact_match": 83.44370860927152, "eval_f1": 89.87271615204905, "step": 20000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06097794369333966, "compression/movement_sparsity/importance_threshold": -0.05478635108307339, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16933295130729675, "epoch": 7.23, "learning_rate": 1.7999583932096346e-06, "loss": 0.1647, "step": 20010, "task_loss": 0.45220547914505005 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06099367326604601, "compression/movement_sparsity/importance_threshold": -0.054572091056119154, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14466992020606995, "epoch": 7.24, "learning_rate": 1.7988214780526128e-06, "loss": 0.1618, "step": 20020, "task_loss": 0.282554566860199 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06100936177475611, "compression/movement_sparsity/importance_threshold": -0.05435839038148216, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13395962119102478, "epoch": 7.24, "learning_rate": 1.7976817024839943e-06, "loss": 0.1693, "step": 20030, "task_loss": 0.6613667011260986 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061025009273141376, "compression/movement_sparsity/importance_threshold": -0.05414524832807843, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.22142064571380615, "epoch": 7.24, "learning_rate": 1.796539070585076e-06, "loss": 0.1735, "step": 20040, "task_loss": 0.6772729754447937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06104061581487323, "compression/movement_sparsity/importance_threshold": -0.053932664164823896, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.145456463098526, "epoch": 7.25, "learning_rate": 1.7953935864473823e-06, "loss": 0.1496, "step": 20050, "task_loss": 0.14888328313827515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061056181453623064, "compression/movement_sparsity/importance_threshold": -0.05372063716063491, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14171883463859558, "epoch": 7.25, "learning_rate": 1.7942452541726505e-06, "loss": 0.1598, "step": 20060, "task_loss": 0.5084316730499268 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06107170624306229, "compression/movement_sparsity/importance_threshold": -0.05350916658442739, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15908685326576233, "epoch": 7.25, "learning_rate": 1.7930940778728165e-06, "loss": 0.1678, "step": 20070, "task_loss": 0.8238155841827393 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06108719023686233, "compression/movement_sparsity/importance_threshold": -0.05329825170511748, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15507642924785614, "epoch": 7.26, "learning_rate": 1.791940061670001e-06, "loss": 0.158, "step": 20080, "task_loss": 0.4280283451080322 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0611026334886946, "compression/movement_sparsity/importance_threshold": -0.05308789179162099, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14263886213302612, "epoch": 7.26, "learning_rate": 1.790783209696493e-06, "loss": 0.1545, "step": 20090, "task_loss": 0.33214807510375977 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061118036052230484, "compression/movement_sparsity/importance_threshold": -0.05287808611285438, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16570046544075012, "epoch": 7.26, "learning_rate": 1.789623526094736e-06, "loss": 0.1755, "step": 20100, "task_loss": 0.5707616209983826 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061133397981141424, "compression/movement_sparsity/importance_threshold": -0.05266883393773336, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1842001974582672, "epoch": 7.27, "learning_rate": 1.7884610150173121e-06, "loss": 0.1682, "step": 20110, "task_loss": 0.5207316279411316 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06114871932909881, "compression/movement_sparsity/importance_threshold": -0.052460134535174285, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16815443336963654, "epoch": 7.27, "learning_rate": 1.78729568062693e-06, "loss": 0.1692, "step": 20120, "task_loss": 0.801066517829895 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06116400014977406, "compression/movement_sparsity/importance_threshold": -0.05225198717409296, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1244630366563797, "epoch": 7.28, "learning_rate": 1.7861275270964063e-06, "loss": 0.1751, "step": 20130, "task_loss": 0.18696001172065735 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06117924049683858, "compression/movement_sparsity/importance_threshold": -0.05204439112340564, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16450370848178864, "epoch": 7.28, "learning_rate": 1.7849565586086527e-06, "loss": 0.1742, "step": 20140, "task_loss": 0.5837537050247192 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06119444042396378, "compression/movement_sparsity/importance_threshold": -0.051837345652028355, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1398741602897644, "epoch": 7.28, "learning_rate": 1.7837827793566615e-06, "loss": 0.1666, "step": 20150, "task_loss": 0.35871630907058716 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061209599984821075, "compression/movement_sparsity/importance_threshold": -0.05163085002887713, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19207346439361572, "epoch": 7.29, "learning_rate": 1.7826061935434892e-06, "loss": 0.1716, "step": 20160, "task_loss": 0.5226284861564636 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061224719233081884, "compression/movement_sparsity/importance_threshold": -0.05142490352286799, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1372203230857849, "epoch": 7.29, "learning_rate": 1.781426805382241e-06, "loss": 0.1636, "step": 20170, "task_loss": 0.514137864112854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0612397982224176, "compression/movement_sparsity/importance_threshold": -0.05121950540291709, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1332128494977951, "epoch": 7.29, "learning_rate": 1.780244619096059e-06, "loss": 0.1751, "step": 20180, "task_loss": 0.2380894273519516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06125483700649964, "compression/movement_sparsity/importance_threshold": -0.05101465493794055, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1710924506187439, "epoch": 7.3, "learning_rate": 1.7790596389181026e-06, "loss": 0.1642, "step": 20190, "task_loss": 0.2540561854839325 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06126983563899942, "compression/movement_sparsity/importance_threshold": -0.050810351396854414, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15287163853645325, "epoch": 7.3, "learning_rate": 1.7778718690915366e-06, "loss": 0.1765, "step": 20200, "task_loss": 0.2441101372241974 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06128479417358835, "compression/movement_sparsity/importance_threshold": -0.05060659404857448, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17295891046524048, "epoch": 7.3, "learning_rate": 1.776681313869515e-06, "loss": 0.1763, "step": 20210, "task_loss": 0.40276211500167847 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061299712663937835, "compression/movement_sparsity/importance_threshold": -0.05040338216201712, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1395740807056427, "epoch": 7.31, "learning_rate": 1.7754879775151655e-06, "loss": 0.1655, "step": 20220, "task_loss": 0.5500630140304565 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06131459116371929, "compression/movement_sparsity/importance_threshold": -0.05020071500609835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16768819093704224, "epoch": 7.31, "learning_rate": 1.774291864301574e-06, "loss": 0.1599, "step": 20230, "task_loss": 0.3271426856517792 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061329429726604115, "compression/movement_sparsity/importance_threshold": -0.04999859184973421, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12167343497276306, "epoch": 7.31, "learning_rate": 1.7730929785117707e-06, "loss": 0.1561, "step": 20240, "task_loss": 0.19313254952430725 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061344228406263746, "compression/movement_sparsity/importance_threshold": -0.04979701196184061, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15813778340816498, "epoch": 7.32, "learning_rate": 1.7718913244387133e-06, "loss": 0.1774, "step": 20250, "task_loss": 0.46160903573036194 }, { "epoch": 7.32, "eval_exact_match": 83.61400189214758, "eval_f1": 89.96683277095251, "step": 20250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06135898725636957, "compression/movement_sparsity/importance_threshold": -0.049595974611333804, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1122448593378067, "epoch": 7.32, "learning_rate": 1.7706869063852716e-06, "loss": 0.1572, "step": 20260, "task_loss": 0.2982158660888672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061373706330593, "compression/movement_sparsity/importance_threshold": -0.04939547906712993, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15496289730072021, "epoch": 7.33, "learning_rate": 1.7694797286642137e-06, "loss": 0.1866, "step": 20270, "task_loss": 0.3803407549858093 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06138838568260545, "compression/movement_sparsity/importance_threshold": -0.04919552459814491, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19763313233852386, "epoch": 7.33, "learning_rate": 1.7683909127719155e-06, "loss": 0.1794, "step": 20280, "task_loss": 0.3198865056037903 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061403025366078334, "compression/movement_sparsity/importance_threshold": -0.048996110473294774, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14763984084129333, "epoch": 7.33, "learning_rate": 1.7671785035994402e-06, "loss": 0.1589, "step": 20290, "task_loss": 0.3247777223587036 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061417625434683064, "compression/movement_sparsity/importance_threshold": -0.048797235961495655, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16530966758728027, "epoch": 7.34, "learning_rate": 1.7659633473222004e-06, "loss": 0.1628, "step": 20300, "task_loss": 0.3593147397041321 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061432185942091044, "compression/movement_sparsity/importance_threshold": -0.048598900331663586, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18628555536270142, "epoch": 7.34, "learning_rate": 1.7647454482914155e-06, "loss": 0.1713, "step": 20310, "task_loss": 0.42950117588043213 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06144670694197368, "compression/movement_sparsity/importance_threshold": -0.04840110285271482, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13749028742313385, "epoch": 7.34, "learning_rate": 1.7635248108681248e-06, "loss": 0.168, "step": 20320, "task_loss": 0.5608773231506348 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06146118848800241, "compression/movement_sparsity/importance_threshold": -0.048203842793565155, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1740993857383728, "epoch": 7.35, "learning_rate": 1.762301439423175e-06, "loss": 0.1668, "step": 20330, "task_loss": 0.5474882125854492 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061475630633848616, "compression/movement_sparsity/importance_threshold": -0.04800711942313074, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17211446166038513, "epoch": 7.35, "learning_rate": 1.7610753383372007e-06, "loss": 0.1616, "step": 20340, "task_loss": 0.5157222747802734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06149003343318371, "compression/movement_sparsity/importance_threshold": -0.04781093201032771, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17125245928764343, "epoch": 7.35, "learning_rate": 1.7598465120006126e-06, "loss": 0.1605, "step": 20350, "task_loss": 0.32079434394836426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06150439693967911, "compression/movement_sparsity/importance_threshold": -0.04761527982407221, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15011651813983917, "epoch": 7.36, "learning_rate": 1.7586149648135792e-06, "loss": 0.1772, "step": 20360, "task_loss": 0.3235490918159485 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06151872120700623, "compression/movement_sparsity/importance_threshold": -0.04742016213328004, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1297205686569214, "epoch": 7.36, "learning_rate": 1.7573807011860113e-06, "loss": 0.1701, "step": 20370, "task_loss": 0.36008718609809875 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061533006288836484, "compression/movement_sparsity/importance_threshold": -0.04722557820686746, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15162158012390137, "epoch": 7.37, "learning_rate": 1.7561437255375478e-06, "loss": 0.1748, "step": 20380, "task_loss": 0.6992220282554626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06154725223884127, "compression/movement_sparsity/importance_threshold": -0.04703152731375049, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12533384561538696, "epoch": 7.37, "learning_rate": 1.7549040422975377e-06, "loss": 0.1749, "step": 20390, "task_loss": 0.4742255210876465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061561459110691996, "compression/movement_sparsity/importance_threshold": -0.04683800872284527, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15761375427246094, "epoch": 7.37, "learning_rate": 1.7536616559050254e-06, "loss": 0.1763, "step": 20400, "task_loss": 0.5680642127990723 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0615756269580601, "compression/movement_sparsity/importance_threshold": -0.046645021703067724, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11964351683855057, "epoch": 7.38, "learning_rate": 1.7524165708087364e-06, "loss": 0.1665, "step": 20410, "task_loss": 0.1860896497964859 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061589755834616955, "compression/movement_sparsity/importance_threshold": -0.04645256552333399, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12572947144508362, "epoch": 7.38, "learning_rate": 1.7511687914670574e-06, "loss": 0.1527, "step": 20420, "task_loss": 0.4305400550365448 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061603845794034, "compression/movement_sparsity/importance_threshold": -0.046260639452560204, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12916436791419983, "epoch": 7.38, "learning_rate": 1.7499183223480233e-06, "loss": 0.1843, "step": 20430, "task_loss": 0.461367130279541 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061617896889982635, "compression/movement_sparsity/importance_threshold": -0.04606924275966229, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1794300675392151, "epoch": 7.39, "learning_rate": 1.7486651679293021e-06, "loss": 0.1691, "step": 20440, "task_loss": 0.4559490382671356 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06163190917613426, "compression/movement_sparsity/importance_threshold": -0.0458783747135566, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16324125230312347, "epoch": 7.39, "learning_rate": 1.7474093326981751e-06, "loss": 0.166, "step": 20450, "task_loss": 0.5274643898010254 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06164588270616032, "compression/movement_sparsity/importance_threshold": -0.04568803458315873, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1191713958978653, "epoch": 7.39, "learning_rate": 1.7461508211515242e-06, "loss": 0.1474, "step": 20460, "task_loss": 0.48396289348602295 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06165981753373218, "compression/movement_sparsity/importance_threshold": -0.04549822163738515, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12512516975402832, "epoch": 7.4, "learning_rate": 1.7448896377958144e-06, "loss": 0.1547, "step": 20470, "task_loss": 0.33188262581825256 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06167371371252128, "compression/movement_sparsity/importance_threshold": -0.045308935145151774, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13970869779586792, "epoch": 7.4, "learning_rate": 1.743625787147078e-06, "loss": 0.1544, "step": 20480, "task_loss": 0.5282965302467346 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06168757129619902, "compression/movement_sparsity/importance_threshold": -0.045120174375374744, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1316584199666977, "epoch": 7.41, "learning_rate": 1.7423592737308973e-06, "loss": 0.1702, "step": 20490, "task_loss": 0.2873547673225403 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06170139033843682, "compression/movement_sparsity/importance_threshold": -0.04493193859696998, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16573722660541534, "epoch": 7.41, "learning_rate": 1.7410901020823918e-06, "loss": 0.1697, "step": 20500, "task_loss": 0.39910686016082764 }, { "epoch": 7.41, "eval_exact_match": 83.71807000946073, "eval_f1": 90.07741181805484, "step": 20500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06171517089290609, "compression/movement_sparsity/importance_threshold": -0.04474422707885373, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13655081391334534, "epoch": 7.41, "learning_rate": 1.7398182767461971e-06, "loss": 0.1551, "step": 20510, "task_loss": 0.2691548466682434 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06172891301327822, "compression/movement_sparsity/importance_threshold": -0.04455703908994191, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16987422108650208, "epoch": 7.42, "learning_rate": 1.7385438022764523e-06, "loss": 0.1721, "step": 20520, "task_loss": 0.4175964593887329 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06174261675322465, "compression/movement_sparsity/importance_threshold": -0.04437037389915055, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13351169228553772, "epoch": 7.42, "learning_rate": 1.7372666832367822e-06, "loss": 0.1573, "step": 20530, "task_loss": 0.46691519021987915 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06175628216641678, "compression/movement_sparsity/importance_threshold": -0.044184230775395905, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16343604028224945, "epoch": 7.42, "learning_rate": 1.7359869242002813e-06, "loss": 0.1737, "step": 20540, "task_loss": 0.5338377356529236 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06176990930652601, "compression/movement_sparsity/importance_threshold": -0.04399860898759389, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13753941655158997, "epoch": 7.43, "learning_rate": 1.7347045297494976e-06, "loss": 0.1782, "step": 20550, "task_loss": 0.4684370756149292 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06178349822722375, "compression/movement_sparsity/importance_threshold": -0.043813507804660756, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13331389427185059, "epoch": 7.43, "learning_rate": 1.7334195044764152e-06, "loss": 0.1693, "step": 20560, "task_loss": 0.42766016721725464 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061797048982181434, "compression/movement_sparsity/importance_threshold": -0.04362892649551231, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15660671889781952, "epoch": 7.43, "learning_rate": 1.73213185298244e-06, "loss": 0.1692, "step": 20570, "task_loss": 0.4442064166069031 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06181056162507043, "compression/movement_sparsity/importance_threshold": -0.043444864329064914, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1403474658727646, "epoch": 7.44, "learning_rate": 1.7308415798783801e-06, "loss": 0.166, "step": 20580, "task_loss": 0.44031822681427 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0618240362095622, "compression/movement_sparsity/importance_threshold": -0.043261320574234374, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14684978127479553, "epoch": 7.44, "learning_rate": 1.7295486897844326e-06, "loss": 0.162, "step": 20590, "task_loss": 0.3407401144504547 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06183747278932813, "compression/movement_sparsity/importance_threshold": -0.04307829449993672, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13846829533576965, "epoch": 7.44, "learning_rate": 1.7282531873301647e-06, "loss": 0.1647, "step": 20600, "task_loss": 0.3741005063056946 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06185087141803962, "compression/movement_sparsity/importance_threshold": -0.042895785375088424, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15826284885406494, "epoch": 7.45, "learning_rate": 1.7269550771544977e-06, "loss": 0.1675, "step": 20610, "task_loss": 0.5317988991737366 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0618642321493681, "compression/movement_sparsity/importance_threshold": -0.04271379246860507, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20660480856895447, "epoch": 7.45, "learning_rate": 1.7256543639056912e-06, "loss": 0.1728, "step": 20620, "task_loss": 0.6406430006027222 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06187755503698497, "compression/movement_sparsity/importance_threshold": -0.04253231504940291, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19647446274757385, "epoch": 7.46, "learning_rate": 1.7243510522413259e-06, "loss": 0.1688, "step": 20630, "task_loss": 0.5294222831726074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06189084013456164, "compression/movement_sparsity/importance_threshold": -0.0423513523863982, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1321870982646942, "epoch": 7.46, "learning_rate": 1.723045146828286e-06, "loss": 0.1639, "step": 20640, "task_loss": 0.3000491261482239 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061904087495769526, "compression/movement_sparsity/importance_threshold": -0.04217090374850685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17913860082626343, "epoch": 7.46, "learning_rate": 1.7217366523427442e-06, "loss": 0.1631, "step": 20650, "task_loss": 0.6945570707321167 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061917297174280024, "compression/movement_sparsity/importance_threshold": -0.04199096840464489, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17457491159439087, "epoch": 7.47, "learning_rate": 1.720425573470144e-06, "loss": 0.1761, "step": 20660, "task_loss": 0.3351823091506958 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06193046922376457, "compression/movement_sparsity/importance_threshold": -0.04181154562372835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1305508315563202, "epoch": 7.47, "learning_rate": 1.7191119149051824e-06, "loss": 0.1625, "step": 20670, "task_loss": 0.36937570571899414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06194360369789456, "compression/movement_sparsity/importance_threshold": -0.041632634674673374, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17179100215435028, "epoch": 7.47, "learning_rate": 1.717795681351795e-06, "loss": 0.1638, "step": 20680, "task_loss": 0.37270528078079224 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061956700650341395, "compression/movement_sparsity/importance_threshold": -0.04145423482639621, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1304541826248169, "epoch": 7.48, "learning_rate": 1.716476877523137e-06, "loss": 0.1696, "step": 20690, "task_loss": 0.2637747526168823 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06196976013477651, "compression/movement_sparsity/importance_threshold": -0.04127634534781255, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1455475091934204, "epoch": 7.48, "learning_rate": 1.7151555081415668e-06, "loss": 0.171, "step": 20700, "task_loss": 0.5703924894332886 }, { "compression/movement_sparsity/importance_regularization_factor": 0.061982782204871294, "compression/movement_sparsity/importance_threshold": -0.04109896550783865, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15408097207546234, "epoch": 7.48, "learning_rate": 1.7138315779386306e-06, "loss": 0.1617, "step": 20710, "task_loss": 0.49040016531944275 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06199576691429716, "compression/movement_sparsity/importance_threshold": -0.040922094575390644, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15398012101650238, "epoch": 7.49, "learning_rate": 1.7125050916550437e-06, "loss": 0.157, "step": 20720, "task_loss": 0.18766553699970245 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062008714316725534, "compression/movement_sparsity/importance_threshold": -0.040745731819384456, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15429307520389557, "epoch": 7.49, "learning_rate": 1.711176054040674e-06, "loss": 0.1717, "step": 20730, "task_loss": 0.40597110986709595 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062021624465827804, "compression/movement_sparsity/importance_threshold": -0.040569876508736336, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12975449860095978, "epoch": 7.5, "learning_rate": 1.7098444698545262e-06, "loss": 0.1546, "step": 20740, "task_loss": 0.3054530620574951 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062034497415275405, "compression/movement_sparsity/importance_threshold": -0.0403945279123622, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17744165658950806, "epoch": 7.5, "learning_rate": 1.7085103438647223e-06, "loss": 0.1682, "step": 20750, "task_loss": 0.42306089401245117 }, { "epoch": 7.5, "eval_exact_match": 83.519394512772, "eval_f1": 89.93130104446098, "step": 20750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06204733321873974, "compression/movement_sparsity/importance_threshold": -0.040219685299178076, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16971740126609802, "epoch": 7.5, "learning_rate": 1.7071736808484873e-06, "loss": 0.1686, "step": 20760, "task_loss": 0.6131412386894226 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062060131929892196, "compression/movement_sparsity/importance_threshold": -0.04004534793810022, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11441051959991455, "epoch": 7.51, "learning_rate": 1.70583448559213e-06, "loss": 0.1578, "step": 20770, "task_loss": 0.38000303506851196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06207289360240422, "compression/movement_sparsity/importance_threshold": -0.03987151509804454, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13867764174938202, "epoch": 7.51, "learning_rate": 1.7044927628910259e-06, "loss": 0.168, "step": 20780, "task_loss": 0.37611913681030273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0620856182899472, "compression/movement_sparsity/importance_threshold": -0.039698186047927186, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18539825081825256, "epoch": 7.51, "learning_rate": 1.7031485175496028e-06, "loss": 0.1753, "step": 20790, "task_loss": 0.3999943733215332 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06209830604619255, "compression/movement_sparsity/importance_threshold": -0.03952536005666407, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14076153934001923, "epoch": 7.52, "learning_rate": 1.7018017543813196e-06, "loss": 0.1617, "step": 20800, "task_loss": 0.5043025612831116 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06211095692481168, "compression/movement_sparsity/importance_threshold": -0.03935303639317156, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17567318677902222, "epoch": 7.52, "learning_rate": 1.7004524782086524e-06, "loss": 0.1791, "step": 20810, "task_loss": 0.38481491804122925 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062123570979476, "compression/movement_sparsity/importance_threshold": -0.039181214326365454, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14990533888339996, "epoch": 7.52, "learning_rate": 1.699100693863075e-06, "loss": 0.1671, "step": 20820, "task_loss": 0.4502103328704834 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06213614826385694, "compression/movement_sparsity/importance_threshold": -0.0390098931251619, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.19391518831253052, "epoch": 7.53, "learning_rate": 1.6977464061850425e-06, "loss": 0.1825, "step": 20830, "task_loss": 0.501262903213501 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06214868883162588, "compression/movement_sparsity/importance_threshold": -0.03883907205847703, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17029324173927307, "epoch": 7.53, "learning_rate": 1.6963896200239738e-06, "loss": 0.1572, "step": 20840, "task_loss": 0.34914785623550415 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06216119273645425, "compression/movement_sparsity/importance_threshold": -0.03866875039522688, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14604946970939636, "epoch": 7.54, "learning_rate": 1.6950303402382348e-06, "loss": 0.1772, "step": 20850, "task_loss": 0.4782135486602783 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06217366003201345, "compression/movement_sparsity/importance_threshold": -0.038498927404327477, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15026389062404633, "epoch": 7.54, "learning_rate": 1.6936685716951208e-06, "loss": 0.1653, "step": 20860, "task_loss": 0.3965921401977539 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0621860907719749, "compression/movement_sparsity/importance_threshold": -0.03832960235469485, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16776353120803833, "epoch": 7.54, "learning_rate": 1.692304319270838e-06, "loss": 0.1725, "step": 20870, "task_loss": 0.6324939131736755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06219848501001001, "compression/movement_sparsity/importance_threshold": -0.03816077451524502, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14998579025268555, "epoch": 7.55, "learning_rate": 1.690937587850487e-06, "loss": 0.1652, "step": 20880, "task_loss": 0.5109502077102661 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06221084279979019, "compression/movement_sparsity/importance_threshold": -0.03799244315489425, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15119841694831848, "epoch": 7.55, "learning_rate": 1.6895683823280459e-06, "loss": 0.1545, "step": 20890, "task_loss": 0.3803994059562683 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06222316419498684, "compression/movement_sparsity/importance_threshold": -0.037824607542558564, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14079336822032928, "epoch": 7.55, "learning_rate": 1.6881967076063509e-06, "loss": 0.152, "step": 20900, "task_loss": 0.3902343809604645 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06223544924927138, "compression/movement_sparsity/importance_threshold": -0.03765726694715399, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18181651830673218, "epoch": 7.56, "learning_rate": 1.6868225685970807e-06, "loss": 0.1599, "step": 20910, "task_loss": 0.41076356172561646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062247698016315224, "compression/movement_sparsity/importance_threshold": -0.037490420637596444, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1383550614118576, "epoch": 7.56, "learning_rate": 1.6854459702207384e-06, "loss": 0.1609, "step": 20920, "task_loss": 0.253571093082428 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06225991054978977, "compression/movement_sparsity/importance_threshold": -0.03732406788280218, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15035834908485413, "epoch": 7.56, "learning_rate": 1.6840669174066326e-06, "loss": 0.1672, "step": 20930, "task_loss": 0.48510587215423584 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06227208690336644, "compression/movement_sparsity/importance_threshold": -0.03715820795168734, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.145534485578537, "epoch": 7.57, "learning_rate": 1.6826854150928612e-06, "loss": 0.1726, "step": 20940, "task_loss": 0.35285377502441406 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06228422713071664, "compression/movement_sparsity/importance_threshold": -0.036992840113167724, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1519620716571808, "epoch": 7.57, "learning_rate": 1.6813014682262937e-06, "loss": 0.178, "step": 20950, "task_loss": 0.32412058115005493 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062296331285511784, "compression/movement_sparsity/importance_threshold": -0.03682796363615959, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1372935175895691, "epoch": 7.57, "learning_rate": 1.6799150817625515e-06, "loss": 0.1757, "step": 20960, "task_loss": 0.3011651337146759 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06230839942142328, "compression/movement_sparsity/importance_threshold": -0.03666357778957896, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1638549268245697, "epoch": 7.58, "learning_rate": 1.6785262606659937e-06, "loss": 0.1654, "step": 20970, "task_loss": 0.6097940802574158 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062320431592122534, "compression/movement_sparsity/importance_threshold": -0.03649968184234187, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15273943543434143, "epoch": 7.58, "learning_rate": 1.6771350099096963e-06, "loss": 0.1603, "step": 20980, "task_loss": 0.48636484146118164 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062332427851280965, "compression/movement_sparsity/importance_threshold": -0.03633627506336434, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15129825472831726, "epoch": 7.59, "learning_rate": 1.6757413344754353e-06, "loss": 0.1675, "step": 20990, "task_loss": 0.3011537194252014 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06234438825256998, "compression/movement_sparsity/importance_threshold": -0.03617335672156263, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18178239464759827, "epoch": 7.59, "learning_rate": 1.674345239353669e-06, "loss": 0.1767, "step": 21000, "task_loss": 0.3912660479545593 }, { "epoch": 7.59, "eval_exact_match": 83.68968779564806, "eval_f1": 90.02347462983266, "step": 21000 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062356312849660984, "compression/movement_sparsity/importance_threshold": -0.03601092608585277, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1544092297554016, "epoch": 7.59, "learning_rate": 1.6729467295435202e-06, "loss": 0.1677, "step": 21010, "task_loss": 0.4685562252998352 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0623682016962254, "compression/movement_sparsity/importance_threshold": -0.035848982425150555, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14076654613018036, "epoch": 7.6, "learning_rate": 1.6715458100527587e-06, "loss": 0.1547, "step": 21020, "task_loss": 0.484361469745636 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062380054845934625, "compression/movement_sparsity/importance_threshold": -0.035687525008372356, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14936229586601257, "epoch": 7.6, "learning_rate": 1.6701424858977814e-06, "loss": 0.1765, "step": 21030, "task_loss": 0.4672033190727234 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06239187235246008, "compression/movement_sparsity/importance_threshold": -0.03552655310443398, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1623292863368988, "epoch": 7.6, "learning_rate": 1.668736762103598e-06, "loss": 0.1618, "step": 21040, "task_loss": 0.41577792167663574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06240365426947318, "compression/movement_sparsity/importance_threshold": -0.03536606598225167, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14613062143325806, "epoch": 7.61, "learning_rate": 1.6673286437038083e-06, "loss": 0.1754, "step": 21050, "task_loss": 0.22772684693336487 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06241540065064532, "compression/movement_sparsity/importance_threshold": -0.035206062910741576, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17443513870239258, "epoch": 7.61, "learning_rate": 1.665918135740589e-06, "loss": 0.1671, "step": 21060, "task_loss": 0.37917977571487427 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06242711154964792, "compression/movement_sparsity/importance_threshold": -0.03504654315881961, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1646181046962738, "epoch": 7.61, "learning_rate": 1.6645052432646715e-06, "loss": 0.1775, "step": 21070, "task_loss": 0.3403213620185852 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06243878702015238, "compression/movement_sparsity/importance_threshold": -0.0348875059954018, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15902748703956604, "epoch": 7.62, "learning_rate": 1.663089971335327e-06, "loss": 0.157, "step": 21080, "task_loss": 0.3937772512435913 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06245042711583012, "compression/movement_sparsity/importance_threshold": -0.034728950689404514, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16536450386047363, "epoch": 7.62, "learning_rate": 1.661672325020346e-06, "loss": 0.1618, "step": 21090, "task_loss": 0.3008047938346863 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06246203189035256, "compression/movement_sparsity/importance_threshold": -0.03457087650974333, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16399919986724854, "epoch": 7.63, "learning_rate": 1.660252309396022e-06, "loss": 0.1695, "step": 21100, "task_loss": 0.45590633153915405 }, { "compression/movement_sparsity/importance_regularization_factor": 0.062473601397391096, "compression/movement_sparsity/importance_threshold": -0.034413282725334726, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14936356246471405, "epoch": 7.63, "learning_rate": 1.6588299295471316e-06, "loss": 0.1614, "step": 21110, "task_loss": 0.30914896726608276 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06248513569061714, "compression/movement_sparsity/importance_threshold": -0.03425616860509462, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13448117673397064, "epoch": 7.63, "learning_rate": 1.6574051905669179e-06, "loss": 0.1685, "step": 21120, "task_loss": 0.5696876049041748 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0624966348237021, "compression/movement_sparsity/importance_threshold": -0.03409953341793903, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15890441834926605, "epoch": 7.64, "learning_rate": 1.6559780975570715e-06, "loss": 0.177, "step": 21130, "task_loss": 0.29223620891571045 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0625080988503174, "compression/movement_sparsity/importance_threshold": -0.03394337643278411, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15977126359939575, "epoch": 7.64, "learning_rate": 1.6545486556277118e-06, "loss": 0.1579, "step": 21140, "task_loss": 0.42903733253479004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06251952782413445, "compression/movement_sparsity/importance_threshold": -0.033787696918545884, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12481575459241867, "epoch": 7.64, "learning_rate": 1.6531168698973698e-06, "loss": 0.1673, "step": 21150, "task_loss": 0.4769006669521332 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06253092179882463, "compression/movement_sparsity/importance_threshold": -0.03363249414414049, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14009490609169006, "epoch": 7.65, "learning_rate": 1.6516827454929691e-06, "loss": 0.167, "step": 21160, "task_loss": 0.47750040888786316 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06254228082805939, "compression/movement_sparsity/importance_threshold": -0.033477767378483736, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13786697387695312, "epoch": 7.65, "learning_rate": 1.6502462875498072e-06, "loss": 0.1607, "step": 21170, "task_loss": 0.31475692987442017 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06255360496551013, "compression/movement_sparsity/importance_threshold": -0.033323515890492095, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13212624192237854, "epoch": 7.65, "learning_rate": 1.6488075012115372e-06, "loss": 0.1696, "step": 21180, "task_loss": 0.2864471673965454 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06256489426484825, "compression/movement_sparsity/importance_threshold": -0.03316973894908137, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1384587585926056, "epoch": 7.66, "learning_rate": 1.6473663916301506e-06, "loss": 0.1717, "step": 21190, "task_loss": 0.23976373672485352 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06257614877974516, "compression/movement_sparsity/importance_threshold": -0.0330164358231676, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1388477087020874, "epoch": 7.66, "learning_rate": 1.6459229639659574e-06, "loss": 0.1524, "step": 21200, "task_loss": 0.30393633246421814 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06258736856387229, "compression/movement_sparsity/importance_threshold": -0.03286360578166703, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1598781794309616, "epoch": 7.67, "learning_rate": 1.6444772233875686e-06, "loss": 0.1706, "step": 21210, "task_loss": 0.4479847252368927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06259855367090102, "compression/movement_sparsity/importance_threshold": -0.03271124809349546, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15409858524799347, "epoch": 7.67, "learning_rate": 1.6430291750718763e-06, "loss": 0.1681, "step": 21220, "task_loss": 0.6857647895812988 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06260970415450279, "compression/movement_sparsity/importance_threshold": -0.03255936202756937, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.156455859541893, "epoch": 7.67, "learning_rate": 1.6415788242040375e-06, "loss": 0.1702, "step": 21230, "task_loss": 0.4796447157859802 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06262082006834899, "compression/movement_sparsity/importance_threshold": -0.03240794685280446, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17359215021133423, "epoch": 7.68, "learning_rate": 1.6401261759774529e-06, "loss": 0.17, "step": 21240, "task_loss": 0.6077663898468018 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06263190146611104, "compression/movement_sparsity/importance_threshold": -0.03225700183811686, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15661515295505524, "epoch": 7.68, "learning_rate": 1.6386712355937506e-06, "loss": 0.1798, "step": 21250, "task_loss": 0.4524965286254883 }, { "epoch": 7.68, "eval_exact_match": 83.59508041627247, "eval_f1": 89.99440399360535, "step": 21250 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06264294840146034, "compression/movement_sparsity/importance_threshold": -0.03210652625242283, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16243040561676025, "epoch": 7.68, "learning_rate": 1.6372140082627653e-06, "loss": 0.1725, "step": 21260, "task_loss": 0.7392611503601074 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06265396092806833, "compression/movement_sparsity/importance_threshold": -0.03195651936463828, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1820927858352661, "epoch": 7.69, "learning_rate": 1.6357544992025214e-06, "loss": 0.1629, "step": 21270, "task_loss": 0.7152500152587891 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06266493909960638, "compression/movement_sparsity/importance_threshold": -0.031806980443679245, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1378469318151474, "epoch": 7.69, "learning_rate": 1.6342927136392146e-06, "loss": 0.1635, "step": 21280, "task_loss": 0.4715365171432495 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06267588296974594, "compression/movement_sparsity/importance_threshold": -0.03165790875846186, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14740951359272003, "epoch": 7.69, "learning_rate": 1.6328286568071903e-06, "loss": 0.1732, "step": 21290, "task_loss": 0.5006458163261414 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0626867925921584, "compression/movement_sparsity/importance_threshold": -0.03150930357790216, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13728134334087372, "epoch": 7.7, "learning_rate": 1.6313623339489285e-06, "loss": 0.16, "step": 21300, "task_loss": 0.3875589966773987 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06269766802051516, "compression/movement_sparsity/importance_threshold": -0.03136116417091628, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.24846616387367249, "epoch": 7.7, "learning_rate": 1.6298937503150226e-06, "loss": 0.1714, "step": 21310, "task_loss": 0.4736020565032959 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06270850930848765, "compression/movement_sparsity/importance_threshold": -0.031213489806420247, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2179969847202301, "epoch": 7.71, "learning_rate": 1.6284229111641613e-06, "loss": 0.1854, "step": 21320, "task_loss": 0.5530951619148254 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06271931650974727, "compression/movement_sparsity/importance_threshold": -0.031066279753330095, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13526931405067444, "epoch": 7.71, "learning_rate": 1.6269498217631102e-06, "loss": 0.1558, "step": 21330, "task_loss": 0.42102596163749695 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06273008967796542, "compression/movement_sparsity/importance_threshold": -0.03091953328056196, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14113759994506836, "epoch": 7.71, "learning_rate": 1.6254744873866926e-06, "loss": 0.1727, "step": 21340, "task_loss": 0.277980774641037 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06274082886681355, "compression/movement_sparsity/importance_threshold": -0.030773249657031876, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13323111832141876, "epoch": 7.72, "learning_rate": 1.6239969133177703e-06, "loss": 0.182, "step": 21350, "task_loss": 0.44536280632019043 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06275153412996302, "compression/movement_sparsity/importance_threshold": -0.030627428151655756, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16452021896839142, "epoch": 7.72, "learning_rate": 1.622517104847225e-06, "loss": 0.1607, "step": 21360, "task_loss": 0.38459068536758423 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06276220552108529, "compression/movement_sparsity/importance_threshold": -0.030482068033349963, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16616493463516235, "epoch": 7.72, "learning_rate": 1.6210350672739396e-06, "loss": 0.1731, "step": 21370, "task_loss": 0.8876696825027466 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06277284309385174, "compression/movement_sparsity/importance_threshold": -0.030337168571030304, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1460474133491516, "epoch": 7.73, "learning_rate": 1.6195508059047782e-06, "loss": 0.1685, "step": 21380, "task_loss": 0.30946826934814453 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06278344690193377, "compression/movement_sparsity/importance_threshold": -0.03019272903361314, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17295223474502563, "epoch": 7.73, "learning_rate": 1.6180643260545695e-06, "loss": 0.1735, "step": 21390, "task_loss": 0.4286682903766632 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06279401699900283, "compression/movement_sparsity/importance_threshold": -0.03004874869001417, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1404990553855896, "epoch": 7.73, "learning_rate": 1.6165756330460838e-06, "loss": 0.1617, "step": 21400, "task_loss": 0.40137583017349243 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0628045534387303, "compression/movement_sparsity/importance_threshold": -0.029905226809149643, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14242419600486755, "epoch": 7.74, "learning_rate": 1.6150847322100181e-06, "loss": 0.1605, "step": 21410, "task_loss": 0.43105363845825195 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06281505627478759, "compression/movement_sparsity/importance_threshold": -0.029762162659935698, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15184879302978516, "epoch": 7.74, "learning_rate": 1.6135916288849743e-06, "loss": 0.1734, "step": 21420, "task_loss": 0.26615574955940247 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06282552556084613, "compression/movement_sparsity/importance_threshold": -0.029619555511288254, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20159095525741577, "epoch": 7.74, "learning_rate": 1.6120963284174414e-06, "loss": 0.1686, "step": 21430, "task_loss": 0.810195803642273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06283596135057731, "compression/movement_sparsity/importance_threshold": -0.02947740463212356, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13885483145713806, "epoch": 7.75, "learning_rate": 1.6105988361617753e-06, "loss": 0.17, "step": 21440, "task_loss": 0.8474996089935303 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06284636369765256, "compression/movement_sparsity/importance_threshold": -0.029335709291357537, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15829253196716309, "epoch": 7.75, "learning_rate": 1.609099157480181e-06, "loss": 0.1593, "step": 21450, "task_loss": 0.3659548759460449 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06285673265574328, "compression/movement_sparsity/importance_threshold": -0.02919446875790621, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16529636085033417, "epoch": 7.76, "learning_rate": 1.6075972977426924e-06, "loss": 0.1695, "step": 21460, "task_loss": 0.2963300943374634 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06286706827852089, "compression/movement_sparsity/importance_threshold": -0.029053682300685835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.2159816175699234, "epoch": 7.76, "learning_rate": 1.6060932623271524e-06, "loss": 0.1615, "step": 21470, "task_loss": 0.40529030561447144 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06287737061965677, "compression/movement_sparsity/importance_threshold": -0.028913349188612325, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13636060059070587, "epoch": 7.76, "learning_rate": 1.6045870566191958e-06, "loss": 0.153, "step": 21480, "task_loss": 0.42834505438804626 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06288763973282237, "compression/movement_sparsity/importance_threshold": -0.028773468690601822, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11330045759677887, "epoch": 7.77, "learning_rate": 1.6030786860122283e-06, "loss": 0.1819, "step": 21490, "task_loss": 0.2636592984199524 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06289787567168907, "compression/movement_sparsity/importance_threshold": -0.028634040075570244, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16234830021858215, "epoch": 7.77, "learning_rate": 1.6015681559074076e-06, "loss": 0.1722, "step": 21500, "task_loss": 0.5878846049308777 }, { "epoch": 7.77, "eval_exact_match": 83.43424787133397, "eval_f1": 89.87035128509326, "step": 21500 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0629080784899283, "compression/movement_sparsity/importance_threshold": -0.028495062612433952, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1218387633562088, "epoch": 7.77, "learning_rate": 1.6000554717136239e-06, "loss": 0.153, "step": 21510, "task_loss": 0.19702383875846863 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06291824824121148, "compression/movement_sparsity/importance_threshold": -0.028356535570108754, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17990869283676147, "epoch": 7.78, "learning_rate": 1.5985406388474809e-06, "loss": 0.1776, "step": 21520, "task_loss": 0.416679322719574 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06292838497920998, "compression/movement_sparsity/importance_threshold": -0.02821845821751079, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18338486552238464, "epoch": 7.78, "learning_rate": 1.5970236627332766e-06, "loss": 0.1697, "step": 21530, "task_loss": 0.8984547853469849 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06293848875759525, "compression/movement_sparsity/importance_threshold": -0.028080829823556086, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.146068274974823, "epoch": 7.78, "learning_rate": 1.595504548802983e-06, "loss": 0.1673, "step": 21540, "task_loss": 0.5259707570075989 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06294855963003868, "compression/movement_sparsity/importance_threshold": -0.027943649657160896, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14311178028583527, "epoch": 7.79, "learning_rate": 1.5939833024962272e-06, "loss": 0.1753, "step": 21550, "task_loss": 0.3172750473022461 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0629585976502117, "compression/movement_sparsity/importance_threshold": -0.027806916987241137, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12539856135845184, "epoch": 7.79, "learning_rate": 1.5924599292602725e-06, "loss": 0.1499, "step": 21560, "task_loss": 0.5374971628189087 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06296860287178568, "compression/movement_sparsity/importance_threshold": -0.027670631082712727, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1600005328655243, "epoch": 7.8, "learning_rate": 1.5909344345499976e-06, "loss": 0.1781, "step": 21570, "task_loss": 0.37696415185928345 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06297857534843208, "compression/movement_sparsity/importance_threshold": -0.027534791212492027, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1379353404045105, "epoch": 7.8, "learning_rate": 1.5894068238278782e-06, "loss": 0.1637, "step": 21580, "task_loss": 0.4239872097969055 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0629885151338223, "compression/movement_sparsity/importance_threshold": -0.027399396645494956, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18081137537956238, "epoch": 7.8, "learning_rate": 1.5878771025639664e-06, "loss": 0.1791, "step": 21590, "task_loss": 0.387275755405426 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06299842228162772, "compression/movement_sparsity/importance_threshold": -0.02726444665063754, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14757958054542542, "epoch": 7.81, "learning_rate": 1.5863452762358725e-06, "loss": 0.1521, "step": 21600, "task_loss": 0.4220387637615204 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06300829684551976, "compression/movement_sparsity/importance_threshold": -0.027129940496836036, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13776156306266785, "epoch": 7.81, "learning_rate": 1.584811350328744e-06, "loss": 0.1666, "step": 21610, "task_loss": 0.38100963830947876 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06301813887916986, "compression/movement_sparsity/importance_threshold": -0.026995877453006356, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15775710344314575, "epoch": 7.81, "learning_rate": 1.5832753303352466e-06, "loss": 0.1679, "step": 21620, "task_loss": 0.34619224071502686 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06302794843624941, "compression/movement_sparsity/importance_threshold": -0.02686225678806442, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15650063753128052, "epoch": 7.82, "learning_rate": 1.5817372217555452e-06, "loss": 0.1709, "step": 21630, "task_loss": 0.2966180741786957 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06303772557042982, "compression/movement_sparsity/importance_threshold": -0.02672907777092659, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.20328043401241302, "epoch": 7.82, "learning_rate": 1.5801970300972825e-06, "loss": 0.1767, "step": 21640, "task_loss": 0.6341493725776672 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0630474703353825, "compression/movement_sparsity/importance_threshold": -0.026596339670508784, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16929413378238678, "epoch": 7.82, "learning_rate": 1.5786547608755604e-06, "loss": 0.1635, "step": 21650, "task_loss": 0.4473767876625061 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06305718278477886, "compression/movement_sparsity/importance_threshold": -0.02646404175572714, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14825168251991272, "epoch": 7.83, "learning_rate": 1.577110419612921e-06, "loss": 0.1699, "step": 21660, "task_loss": 0.4353300929069519 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0630668629722903, "compression/movement_sparsity/importance_threshold": -0.02633218329549769, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12816864252090454, "epoch": 7.83, "learning_rate": 1.575564011839325e-06, "loss": 0.1517, "step": 21670, "task_loss": 0.6550402641296387 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06307651095158827, "compression/movement_sparsity/importance_threshold": -0.026200763558736462, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17422786355018616, "epoch": 7.84, "learning_rate": 1.574015543092133e-06, "loss": 0.1812, "step": 21680, "task_loss": 0.496726393699646 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06308612677634413, "compression/movement_sparsity/importance_threshold": -0.026069781814359483, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18180793523788452, "epoch": 7.84, "learning_rate": 1.5724650189160866e-06, "loss": 0.1807, "step": 21690, "task_loss": 0.5905567407608032 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06309571050022933, "compression/movement_sparsity/importance_threshold": -0.025939237331283005, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14586716890335083, "epoch": 7.84, "learning_rate": 1.5709124448632855e-06, "loss": 0.1862, "step": 21700, "task_loss": 0.908523440361023 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06310526217691527, "compression/movement_sparsity/importance_threshold": -0.025809129378422835, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16504916548728943, "epoch": 7.85, "learning_rate": 1.5693578264931715e-06, "loss": 0.1572, "step": 21710, "task_loss": 0.3277358114719391 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06311478186007335, "compression/movement_sparsity/importance_threshold": -0.025679457224695224, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16319605708122253, "epoch": 7.85, "learning_rate": 1.5678011693725051e-06, "loss": 0.1624, "step": 21720, "task_loss": 0.3537963628768921 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06312426960337499, "compression/movement_sparsity/importance_threshold": -0.02555022013901631, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16419553756713867, "epoch": 7.85, "learning_rate": 1.5662424790753482e-06, "loss": 0.1614, "step": 21730, "task_loss": 0.4133787751197815 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06313372546049159, "compression/movement_sparsity/importance_threshold": -0.025421417390302015, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1400185227394104, "epoch": 7.86, "learning_rate": 1.5646817611830424e-06, "loss": 0.1625, "step": 21740, "task_loss": 0.32223424315452576 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06314314948509457, "compression/movement_sparsity/importance_threshold": -0.025293048247468364, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15030598640441895, "epoch": 7.86, "learning_rate": 1.5631190212841903e-06, "loss": 0.1713, "step": 21750, "task_loss": 0.4918508529663086 }, { "epoch": 7.86, "eval_exact_match": 83.43424787133397, "eval_f1": 89.87738840205206, "step": 21750 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06315254173085535, "compression/movement_sparsity/importance_threshold": -0.025165111979431498, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1533832848072052, "epoch": 7.86, "learning_rate": 1.5615542649746348e-06, "loss": 0.1572, "step": 21760, "task_loss": 0.3352183997631073 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06316190225144531, "compression/movement_sparsity/importance_threshold": -0.025037607855107558, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.13758356869220734, "epoch": 7.87, "learning_rate": 1.5599874978574383e-06, "loss": 0.1681, "step": 21770, "task_loss": 0.2775072455406189 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06317123110053591, "compression/movement_sparsity/importance_threshold": -0.02491053514341246, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15805354714393616, "epoch": 7.87, "learning_rate": 1.558418725542865e-06, "loss": 0.1715, "step": 21780, "task_loss": 0.3538605868816376 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0631805283317985, "compression/movement_sparsity/importance_threshold": -0.024783893113262234, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14023539423942566, "epoch": 7.87, "learning_rate": 1.5568479536483574e-06, "loss": 0.1725, "step": 21790, "task_loss": 0.438821017742157 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06318979399890454, "compression/movement_sparsity/importance_threshold": -0.024657681033573242, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1380080133676529, "epoch": 7.88, "learning_rate": 1.5552751877985198e-06, "loss": 0.1598, "step": 21800, "task_loss": 0.35417595505714417 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06319902815552542, "compression/movement_sparsity/importance_threshold": -0.02453189817326129, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1379581242799759, "epoch": 7.88, "learning_rate": 1.5537004336250953e-06, "loss": 0.1623, "step": 21810, "task_loss": 0.2302461713552475 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06320823085533255, "compression/movement_sparsity/importance_threshold": -0.02440654380124263, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17599256336688995, "epoch": 7.89, "learning_rate": 1.5521236967669476e-06, "loss": 0.1751, "step": 21820, "task_loss": 0.4087195098400116 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06321740215199734, "compression/movement_sparsity/importance_threshold": -0.02428161718643307, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15393874049186707, "epoch": 7.89, "learning_rate": 1.5505449828700391e-06, "loss": 0.1707, "step": 21830, "task_loss": 0.36279088258743286 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0632265420991912, "compression/movement_sparsity/importance_threshold": -0.024157117597748967, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1479380875825882, "epoch": 7.89, "learning_rate": 1.5489642975874122e-06, "loss": 0.1614, "step": 21840, "task_loss": 0.43778854608535767 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06323565075058554, "compression/movement_sparsity/importance_threshold": -0.024033044304106133, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.16105225682258606, "epoch": 7.9, "learning_rate": 1.5473816465791684e-06, "loss": 0.1619, "step": 21850, "task_loss": 0.3186939060688019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06324472815985178, "compression/movement_sparsity/importance_threshold": -0.023909396574420927, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.12178893387317657, "epoch": 7.9, "learning_rate": 1.5457970355124478e-06, "loss": 0.1616, "step": 21860, "task_loss": 0.5092835426330566 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06325377438066133, "compression/movement_sparsity/importance_threshold": -0.023786173677609046, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14096394181251526, "epoch": 7.9, "learning_rate": 1.5442104700614089e-06, "loss": 0.173, "step": 21870, "task_loss": 0.4207335114479065 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06326278946668559, "compression/movement_sparsity/importance_threshold": -0.02366337488258685, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1833285242319107, "epoch": 7.91, "learning_rate": 1.542621955907209e-06, "loss": 0.1776, "step": 21880, "task_loss": 0.35541167855262756 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06327177347159599, "compression/movement_sparsity/importance_threshold": -0.02354099945827015, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1276547908782959, "epoch": 7.91, "learning_rate": 1.5410314987379826e-06, "loss": 0.1608, "step": 21890, "task_loss": 0.9517670273780823 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06328072644906392, "compression/movement_sparsity/importance_threshold": -0.023419046673575417, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17998504638671875, "epoch": 7.91, "learning_rate": 1.5394391042488227e-06, "loss": 0.162, "step": 21900, "task_loss": 0.39371180534362793 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06328964845276079, "compression/movement_sparsity/importance_threshold": -0.023297515797418344, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15145400166511536, "epoch": 7.92, "learning_rate": 1.5378447781417583e-06, "loss": 0.1493, "step": 21910, "task_loss": 0.35905247926712036 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06329853953635801, "compression/movement_sparsity/importance_threshold": -0.023176406098715185, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14053452014923096, "epoch": 7.92, "learning_rate": 1.5362485261257357e-06, "loss": 0.1702, "step": 21920, "task_loss": 0.6316653490066528 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06330739975352702, "compression/movement_sparsity/importance_threshold": -0.023055716846381857, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.1474117934703827, "epoch": 7.93, "learning_rate": 1.5346503539165975e-06, "loss": 0.1663, "step": 21930, "task_loss": 0.4833589792251587 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0633162291579392, "compression/movement_sparsity/importance_threshold": -0.0229354473093345, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.11505290865898132, "epoch": 7.93, "learning_rate": 1.5330502672370624e-06, "loss": 0.1662, "step": 21940, "task_loss": 0.22523199021816254 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06332502780326597, "compression/movement_sparsity/importance_threshold": -0.022815596756489254, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.15433210134506226, "epoch": 7.93, "learning_rate": 1.5314482718167034e-06, "loss": 0.1667, "step": 21950, "task_loss": 0.4213239252567291 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06333379574317875, "compression/movement_sparsity/importance_threshold": -0.022696164456762147, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18093153834342957, "epoch": 7.94, "learning_rate": 1.5298443733919294e-06, "loss": 0.1637, "step": 21960, "task_loss": 0.34346556663513184 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06334253303134893, "compression/movement_sparsity/importance_threshold": -0.02257714967906932, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.10571896284818649, "epoch": 7.94, "learning_rate": 1.5282385777059635e-06, "loss": 0.1562, "step": 21970, "task_loss": 0.17734256386756897 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06335123972144793, "compression/movement_sparsity/importance_threshold": -0.022458551692326578, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.18012669682502747, "epoch": 7.94, "learning_rate": 1.526630890508821e-06, "loss": 0.1653, "step": 21980, "task_loss": 0.5104833245277405 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06335991586714718, "compression/movement_sparsity/importance_threshold": -0.022340369765450174, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.17314781248569489, "epoch": 7.95, "learning_rate": 1.5250213175572921e-06, "loss": 0.1676, "step": 21990, "task_loss": 0.3881048560142517 }, { "compression/movement_sparsity/importance_regularization_factor": 0.06336856152211805, "compression/movement_sparsity/importance_threshold": -0.022222603167356247, "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, "compression/movement_sparsity/model_sparsity": 0.22154907944140886, "compression_loss": 0.0, "distillation_loss": 0.14116662740707397, "epoch": 7.95, "learning_rate": 1.523409864614919e-06, "loss": 0.1608, "step": 22000, "task_loss": 0.40728896856307983 }, { "epoch": 7.95, "eval_exact_match": 83.80321665089878, "eval_f1": 90.15605593670335, "step": 22000 } ], "max_steps": 49806, "num_train_epochs": 18, "total_flos": 1.968581126605824e+16, "trial_name": null, "trial_params": null }