diff --git "a/trainer_states.json" "b/trainer_states.json" new file mode 100644--- /dev/null +++ "b/trainer_states.json" @@ -0,0 +1,29144 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.9508492952656304, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1391890048980713, + "epoch": 0.0, + "learning_rate": 1.9999982096052276e-06, + "loss": 0.1776, + "step": 10, + "task_loss": 0.3535611927509308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1461760699748993, + "epoch": 0.01, + "learning_rate": 1.999992838427322e-06, + "loss": 0.1969, + "step": 20, + "task_loss": 0.3184128403663635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16645857691764832, + "epoch": 0.01, + "learning_rate": 1.9999838864855164e-06, + "loss": 0.1698, + "step": 30, + "task_loss": 0.42190682888031006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1625981330871582, + "epoch": 0.01, + "learning_rate": 1.999971353811865e-06, + "loss": 0.1782, + "step": 40, + "task_loss": 0.23675988614559174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14059175550937653, + "epoch": 0.02, + "learning_rate": 1.9999552404512455e-06, + "loss": 0.1794, + "step": 50, + "task_loss": 0.3601588010787964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19353535771369934, + "epoch": 0.02, + "learning_rate": 1.9999355464613565e-06, + "loss": 0.1838, + "step": 60, + "task_loss": 0.5550195574760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.181913822889328, + "epoch": 0.03, + "learning_rate": 1.999912271912717e-06, + "loss": 0.1859, + "step": 70, + "task_loss": 0.3225249648094177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13254797458648682, + "epoch": 0.03, + "learning_rate": 1.999885416888669e-06, + "loss": 0.1691, + "step": 80, + "task_loss": 0.2899574637413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16020077466964722, + "epoch": 0.03, + "learning_rate": 1.999854981485375e-06, + "loss": 0.1803, + "step": 90, + "task_loss": 0.43814536929130554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16687241196632385, + "epoch": 0.04, + "learning_rate": 1.999820965811817e-06, + "loss": 0.1859, + "step": 100, + "task_loss": 0.746590256690979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.155348539352417, + "epoch": 0.04, + "learning_rate": 1.9997833699897987e-06, + "loss": 0.1793, + "step": 110, + "task_loss": 0.45200714468955994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1690862774848938, + "epoch": 0.04, + "learning_rate": 1.999742194153942e-06, + "loss": 0.1799, + "step": 120, + "task_loss": 0.2519175112247467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18965381383895874, + "epoch": 0.05, + "learning_rate": 1.99969743845169e-06, + "loss": 0.1775, + "step": 130, + "task_loss": 0.35963237285614014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14351242780685425, + "epoch": 0.05, + "learning_rate": 1.9996491030433027e-06, + "loss": 0.1842, + "step": 140, + "task_loss": 0.5129216313362122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19938349723815918, + "epoch": 0.05, + "learning_rate": 1.999597188101859e-06, + "loss": 0.1822, + "step": 150, + "task_loss": 0.6201828718185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15174296498298645, + "epoch": 0.06, + "learning_rate": 1.9995416938132554e-06, + "loss": 0.1762, + "step": 160, + "task_loss": 0.2371373474597931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14393183588981628, + "epoch": 0.06, + "learning_rate": 1.9994826203762056e-06, + "loss": 0.1756, + "step": 170, + "task_loss": 0.3284216523170471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15572452545166016, + "epoch": 0.07, + "learning_rate": 1.9994199680022386e-06, + "loss": 0.1785, + "step": 180, + "task_loss": 0.17561133205890656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.160763680934906, + "epoch": 0.07, + "learning_rate": 1.9993537369157004e-06, + "loss": 0.1814, + "step": 190, + "task_loss": 0.30648908019065857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13948312401771545, + "epoch": 0.07, + "learning_rate": 1.9992839273537492e-06, + "loss": 0.1719, + "step": 200, + "task_loss": 0.5067750215530396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19486072659492493, + "epoch": 0.08, + "learning_rate": 1.9992105395663598e-06, + "loss": 0.1853, + "step": 210, + "task_loss": 0.4411253333091736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14867699146270752, + "epoch": 0.08, + "learning_rate": 1.999133573816317e-06, + "loss": 0.1812, + "step": 220, + "task_loss": 0.6160109639167786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14178313314914703, + "epoch": 0.08, + "learning_rate": 1.99905303037922e-06, + "loss": 0.1844, + "step": 230, + "task_loss": 0.47401952743530273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15987926721572876, + "epoch": 0.09, + "learning_rate": 1.9989689095434775e-06, + "loss": 0.174, + "step": 240, + "task_loss": 0.4866279065608978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1667332649230957, + "epoch": 0.09, + "learning_rate": 1.9988812116103086e-06, + "loss": 0.1783, + "step": 250, + "task_loss": 0.5632772445678711 + }, + { + "epoch": 0.09, + "eval_exact_match": 83.68968779564806, + "eval_f1": 90.07662178846462, + "step": 250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13907156884670258, + "epoch": 0.09, + "learning_rate": 1.998789936893741e-06, + "loss": 0.1674, + "step": 260, + "task_loss": 0.25233495235443115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12035252153873444, + "epoch": 0.1, + "learning_rate": 1.99869508572061e-06, + "loss": 0.1682, + "step": 270, + "task_loss": 0.33309412002563477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1731790155172348, + "epoch": 0.1, + "learning_rate": 1.9985966584305585e-06, + "loss": 0.18, + "step": 280, + "task_loss": 0.38126981258392334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1732906997203827, + "epoch": 0.1, + "learning_rate": 1.9984946553760333e-06, + "loss": 0.1723, + "step": 290, + "task_loss": 0.3908073902130127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17171213030815125, + "epoch": 0.11, + "learning_rate": 1.998389076922286e-06, + "loss": 0.1833, + "step": 300, + "task_loss": 0.3038800358772278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17187830805778503, + "epoch": 0.11, + "learning_rate": 1.9982799234473707e-06, + "loss": 0.1727, + "step": 310, + "task_loss": 0.5874561071395874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14011260867118835, + "epoch": 0.12, + "learning_rate": 1.998167195342143e-06, + "loss": 0.1728, + "step": 320, + "task_loss": 0.466874361038208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14882700145244598, + "epoch": 0.12, + "learning_rate": 1.998050893010259e-06, + "loss": 0.1806, + "step": 330, + "task_loss": 0.5158421993255615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2140088677406311, + "epoch": 0.12, + "learning_rate": 1.9979310168681726e-06, + "loss": 0.1776, + "step": 340, + "task_loss": 0.49200907349586487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1642945408821106, + "epoch": 0.13, + "learning_rate": 1.9978075673451348e-06, + "loss": 0.1922, + "step": 350, + "task_loss": 0.2854197919368744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12045970559120178, + "epoch": 0.13, + "learning_rate": 1.9976805448831925e-06, + "loss": 0.1795, + "step": 360, + "task_loss": 0.3827168345451355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16507384181022644, + "epoch": 0.13, + "learning_rate": 1.9975499499371862e-06, + "loss": 0.173, + "step": 370, + "task_loss": 0.22587484121322632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15693482756614685, + "epoch": 0.14, + "learning_rate": 1.99741578297475e-06, + "loss": 0.1795, + "step": 380, + "task_loss": 0.47314953804016113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15382714569568634, + "epoch": 0.14, + "learning_rate": 1.9972780444763056e-06, + "loss": 0.169, + "step": 390, + "task_loss": 0.46679824590682983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1490176022052765, + "epoch": 0.14, + "learning_rate": 1.9971367349350676e-06, + "loss": 0.169, + "step": 400, + "task_loss": 0.4132170081138611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1326831877231598, + "epoch": 0.15, + "learning_rate": 1.9969918548570343e-06, + "loss": 0.1712, + "step": 410, + "task_loss": 0.1556464433670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17481349408626556, + "epoch": 0.15, + "learning_rate": 1.9968434047609913e-06, + "loss": 0.1751, + "step": 420, + "task_loss": 0.3467264771461487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1614687293767929, + "epoch": 0.16, + "learning_rate": 1.9966913851785074e-06, + "loss": 0.1698, + "step": 430, + "task_loss": 0.38117820024490356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19468063116073608, + "epoch": 0.16, + "learning_rate": 1.996535796653933e-06, + "loss": 0.1758, + "step": 440, + "task_loss": 0.30559083819389343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18599817156791687, + "epoch": 0.16, + "learning_rate": 1.996376639744396e-06, + "loss": 0.18, + "step": 450, + "task_loss": 0.5101648569107056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14922016859054565, + "epoch": 0.17, + "learning_rate": 1.996213915019806e-06, + "loss": 0.1767, + "step": 460, + "task_loss": 0.39836177229881287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.133830264210701, + "epoch": 0.17, + "learning_rate": 1.9960476230628453e-06, + "loss": 0.1811, + "step": 470, + "task_loss": 0.5501172542572021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15386879444122314, + "epoch": 0.17, + "learning_rate": 1.9958777644689696e-06, + "loss": 0.1752, + "step": 480, + "task_loss": 0.7182563543319702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17507609724998474, + "epoch": 0.18, + "learning_rate": 1.995704339846408e-06, + "loss": 0.1801, + "step": 490, + "task_loss": 0.40363389253616333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17033545672893524, + "epoch": 0.18, + "learning_rate": 1.9955273498161563e-06, + "loss": 0.1853, + "step": 500, + "task_loss": 0.5597988367080688 + }, + { + "epoch": 0.18, + "eval_exact_match": 83.69914853358561, + "eval_f1": 90.06682101600445, + "step": 500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16981837153434753, + "epoch": 0.18, + "learning_rate": 1.9953467950119794e-06, + "loss": 0.1718, + "step": 510, + "task_loss": 0.34875768423080444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12693539261817932, + "epoch": 0.19, + "learning_rate": 1.9951626760804064e-06, + "loss": 0.1762, + "step": 520, + "task_loss": 0.3869848847389221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14310693740844727, + "epoch": 0.19, + "learning_rate": 1.9949749936807275e-06, + "loss": 0.1714, + "step": 530, + "task_loss": 0.3260875344276428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1642155647277832, + "epoch": 0.2, + "learning_rate": 1.9947837484849944e-06, + "loss": 0.1753, + "step": 540, + "task_loss": 0.25048545002937317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16132649779319763, + "epoch": 0.2, + "learning_rate": 1.9945889411780158e-06, + "loss": 0.1722, + "step": 550, + "task_loss": 0.34237614274024963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1687171906232834, + "epoch": 0.2, + "learning_rate": 1.9943905724573555e-06, + "loss": 0.162, + "step": 560, + "task_loss": 0.3560226559638977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15636363625526428, + "epoch": 0.21, + "learning_rate": 1.99418864303333e-06, + "loss": 0.1733, + "step": 570, + "task_loss": 0.559543788433075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16653823852539062, + "epoch": 0.21, + "learning_rate": 1.993983153629007e-06, + "loss": 0.1774, + "step": 580, + "task_loss": 0.36561131477355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14254853129386902, + "epoch": 0.21, + "learning_rate": 1.9937741049802e-06, + "loss": 0.1763, + "step": 590, + "task_loss": 0.3504565358161926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14156261086463928, + "epoch": 0.22, + "learning_rate": 1.9935614978354687e-06, + "loss": 0.171, + "step": 600, + "task_loss": 0.4731927812099457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.24176684021949768, + "epoch": 0.22, + "learning_rate": 1.993345332956114e-06, + "loss": 0.18, + "step": 610, + "task_loss": 0.44777655601501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18350887298583984, + "epoch": 0.22, + "learning_rate": 1.9931256111161768e-06, + "loss": 0.1735, + "step": 620, + "task_loss": 0.38023048639297485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1785336434841156, + "epoch": 0.23, + "learning_rate": 1.9929023331024354e-06, + "loss": 0.1838, + "step": 630, + "task_loss": 0.4373171329498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11950236558914185, + "epoch": 0.23, + "learning_rate": 1.992675499714401e-06, + "loss": 0.1823, + "step": 640, + "task_loss": 0.4059186577796936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1564953625202179, + "epoch": 0.23, + "learning_rate": 1.992445111764316e-06, + "loss": 0.1766, + "step": 650, + "task_loss": 0.41068634390830994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17433582246303558, + "epoch": 0.24, + "learning_rate": 1.9922111700771514e-06, + "loss": 0.1789, + "step": 660, + "task_loss": 0.3949933350086212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16213908791542053, + "epoch": 0.24, + "learning_rate": 1.9919736754906037e-06, + "loss": 0.1664, + "step": 670, + "task_loss": 0.4205024242401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19312430918216705, + "epoch": 0.25, + "learning_rate": 1.99173262885509e-06, + "loss": 0.1738, + "step": 680, + "task_loss": 0.38273149728775024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15186524391174316, + "epoch": 0.25, + "learning_rate": 1.991488031033748e-06, + "loss": 0.1775, + "step": 690, + "task_loss": 0.43976613879203796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16413924098014832, + "epoch": 0.25, + "learning_rate": 1.9912398829024316e-06, + "loss": 0.1846, + "step": 700, + "task_loss": 0.4296082854270935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1467546969652176, + "epoch": 0.26, + "learning_rate": 1.9909881853497063e-06, + "loss": 0.1829, + "step": 710, + "task_loss": 0.2708396017551422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14315572381019592, + "epoch": 0.26, + "learning_rate": 1.990732939276848e-06, + "loss": 0.1761, + "step": 720, + "task_loss": 0.28182071447372437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20183299481868744, + "epoch": 0.26, + "learning_rate": 1.9904741455978396e-06, + "loss": 0.1863, + "step": 730, + "task_loss": 1.0296525955200195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16263312101364136, + "epoch": 0.27, + "learning_rate": 1.990211805239367e-06, + "loss": 0.1836, + "step": 740, + "task_loss": 0.7796942591667175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17141658067703247, + "epoch": 0.27, + "learning_rate": 1.989945919140815e-06, + "loss": 0.1751, + "step": 750, + "task_loss": 0.40661606192588806 + }, + { + "epoch": 0.27, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.9443629076221, + "step": 750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20380395650863647, + "epoch": 0.27, + "learning_rate": 1.9896764882542666e-06, + "loss": 0.173, + "step": 760, + "task_loss": 0.2610260844230652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1519322693347931, + "epoch": 0.28, + "learning_rate": 1.9894035135444964e-06, + "loss": 0.1762, + "step": 770, + "task_loss": 0.4701826870441437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16315855085849762, + "epoch": 0.28, + "learning_rate": 1.9891269959889698e-06, + "loss": 0.1768, + "step": 780, + "task_loss": 0.4588128626346588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2139987349510193, + "epoch": 0.29, + "learning_rate": 1.988846936577838e-06, + "loss": 0.1711, + "step": 790, + "task_loss": 0.509343147277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14751572906970978, + "epoch": 0.29, + "learning_rate": 1.9885633363139344e-06, + "loss": 0.1943, + "step": 800, + "task_loss": 0.571614146232605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16162118315696716, + "epoch": 0.29, + "learning_rate": 1.9882761962127727e-06, + "loss": 0.1629, + "step": 810, + "task_loss": 0.3844223618507385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1602107584476471, + "epoch": 0.3, + "learning_rate": 1.9879855173025404e-06, + "loss": 0.1685, + "step": 820, + "task_loss": 0.3083072602748871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14762470126152039, + "epoch": 0.3, + "learning_rate": 1.9876913006240975e-06, + "loss": 0.1686, + "step": 830, + "task_loss": 0.421251118183136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1615067422389984, + "epoch": 0.3, + "learning_rate": 1.9873935472309726e-06, + "loss": 0.1734, + "step": 840, + "task_loss": 0.3794122338294983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14823423326015472, + "epoch": 0.31, + "learning_rate": 1.9870922581893573e-06, + "loss": 0.1748, + "step": 850, + "task_loss": 0.3405918478965759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1978568136692047, + "epoch": 0.31, + "learning_rate": 1.9867874345781048e-06, + "loss": 0.1775, + "step": 860, + "task_loss": 0.5284197330474854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22010980546474457, + "epoch": 0.31, + "learning_rate": 1.9864790774887234e-06, + "loss": 0.1765, + "step": 870, + "task_loss": 0.35552144050598145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17657122015953064, + "epoch": 0.32, + "learning_rate": 1.986167188025376e-06, + "loss": 0.1685, + "step": 880, + "task_loss": 0.3586549460887909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15283505618572235, + "epoch": 0.32, + "learning_rate": 1.985851767304873e-06, + "loss": 0.1852, + "step": 890, + "task_loss": 0.48049455881118774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17572566866874695, + "epoch": 0.33, + "learning_rate": 1.985532816456669e-06, + "loss": 0.1661, + "step": 900, + "task_loss": 0.4304676055908203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12883397936820984, + "epoch": 0.33, + "learning_rate": 1.98521033662286e-06, + "loss": 0.1767, + "step": 910, + "task_loss": 0.37426260113716125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14257290959358215, + "epoch": 0.33, + "learning_rate": 1.984884328958179e-06, + "loss": 0.191, + "step": 920, + "task_loss": 0.6537871360778809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14154499769210815, + "epoch": 0.34, + "learning_rate": 1.9845547946299902e-06, + "loss": 0.1865, + "step": 930, + "task_loss": 0.5936552286148071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1575993001461029, + "epoch": 0.34, + "learning_rate": 1.984221734818287e-06, + "loss": 0.169, + "step": 940, + "task_loss": 0.42563629150390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15309542417526245, + "epoch": 0.34, + "learning_rate": 1.9838851507156864e-06, + "loss": 0.1771, + "step": 950, + "task_loss": 0.5308173894882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1929389238357544, + "epoch": 0.35, + "learning_rate": 1.983545043527425e-06, + "loss": 0.1703, + "step": 960, + "task_loss": 0.5199047923088074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15166574716567993, + "epoch": 0.35, + "learning_rate": 1.9832014144713554e-06, + "loss": 0.1738, + "step": 970, + "task_loss": 0.3604187071323395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14696621894836426, + "epoch": 0.35, + "learning_rate": 1.9828542647779415e-06, + "loss": 0.1776, + "step": 980, + "task_loss": 0.3480679392814636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17910030484199524, + "epoch": 0.36, + "learning_rate": 1.9825035956902515e-06, + "loss": 0.184, + "step": 990, + "task_loss": 0.3781590461730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18165017664432526, + "epoch": 0.36, + "learning_rate": 1.9821494084639595e-06, + "loss": 0.1759, + "step": 1000, + "task_loss": 0.5116103887557983 + }, + { + "epoch": 0.36, + "eval_exact_match": 83.45316934720908, + "eval_f1": 89.78098563856513, + "step": 1000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17691287398338318, + "epoch": 0.37, + "learning_rate": 1.9817917043673343e-06, + "loss": 0.1735, + "step": 1010, + "task_loss": 0.5271925330162048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17111527919769287, + "epoch": 0.37, + "learning_rate": 1.9814304846812396e-06, + "loss": 0.1766, + "step": 1020, + "task_loss": 0.28790348768234253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1404891461133957, + "epoch": 0.37, + "learning_rate": 1.981065750699127e-06, + "loss": 0.1743, + "step": 1030, + "task_loss": 0.565461277961731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.121620774269104, + "epoch": 0.38, + "learning_rate": 1.980697503727031e-06, + "loss": 0.1746, + "step": 1040, + "task_loss": 0.3549140691757202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13501675426959991, + "epoch": 0.38, + "learning_rate": 1.9803257450835683e-06, + "loss": 0.172, + "step": 1050, + "task_loss": 0.6684577465057373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.137796550989151, + "epoch": 0.38, + "learning_rate": 1.9799504760999275e-06, + "loss": 0.1804, + "step": 1060, + "task_loss": 0.3713378310203552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14667566120624542, + "epoch": 0.39, + "learning_rate": 1.9795716981198676e-06, + "loss": 0.1728, + "step": 1070, + "task_loss": 0.35704660415649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21063314378261566, + "epoch": 0.39, + "learning_rate": 1.979189412499713e-06, + "loss": 0.194, + "step": 1080, + "task_loss": 0.5208743810653687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13344134390354156, + "epoch": 0.39, + "learning_rate": 1.9788036206083484e-06, + "loss": 0.1723, + "step": 1090, + "task_loss": 0.27669817209243774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17827028036117554, + "epoch": 0.4, + "learning_rate": 1.9784143238272128e-06, + "loss": 0.1615, + "step": 1100, + "task_loss": 0.3210203945636749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16757118701934814, + "epoch": 0.4, + "learning_rate": 1.9780215235502968e-06, + "loss": 0.1686, + "step": 1110, + "task_loss": 0.43170055747032166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15356716513633728, + "epoch": 0.4, + "learning_rate": 1.9776252211841346e-06, + "loss": 0.1724, + "step": 1120, + "task_loss": 0.3005247116088867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16435839235782623, + "epoch": 0.41, + "learning_rate": 1.977225418147802e-06, + "loss": 0.1757, + "step": 1130, + "task_loss": 0.28998297452926636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.09947463124990463, + "epoch": 0.41, + "learning_rate": 1.97682211587291e-06, + "loss": 0.1595, + "step": 1140, + "task_loss": 0.36723411083221436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18333488702774048, + "epoch": 0.42, + "learning_rate": 1.976415315803599e-06, + "loss": 0.1878, + "step": 1150, + "task_loss": 0.42832082509994507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13132742047309875, + "epoch": 0.42, + "learning_rate": 1.9760050193965333e-06, + "loss": 0.174, + "step": 1160, + "task_loss": 0.3157771825790405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.155666783452034, + "epoch": 0.42, + "learning_rate": 1.9755912281208997e-06, + "loss": 0.1633, + "step": 1170, + "task_loss": 0.30877023935317993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2435683161020279, + "epoch": 0.43, + "learning_rate": 1.9751739434583966e-06, + "loss": 0.1782, + "step": 1180, + "task_loss": 0.35170674324035645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1864657998085022, + "epoch": 0.43, + "learning_rate": 1.9747531669032326e-06, + "loss": 0.1886, + "step": 1190, + "task_loss": 0.504147469997406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14224031567573547, + "epoch": 0.43, + "learning_rate": 1.97432889996212e-06, + "loss": 0.1674, + "step": 1200, + "task_loss": 0.47088128328323364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13773639500141144, + "epoch": 0.44, + "learning_rate": 1.9739011441542703e-06, + "loss": 0.1722, + "step": 1210, + "task_loss": 0.41278308629989624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18634013831615448, + "epoch": 0.44, + "learning_rate": 1.973469901011386e-06, + "loss": 0.1812, + "step": 1220, + "task_loss": 0.6850751042366028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17051789164543152, + "epoch": 0.44, + "learning_rate": 1.973035172077658e-06, + "loss": 0.1677, + "step": 1230, + "task_loss": 0.41830068826675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1463625729084015, + "epoch": 0.45, + "learning_rate": 1.97259695890976e-06, + "loss": 0.1775, + "step": 1240, + "task_loss": 0.5584670305252075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1390727460384369, + "epoch": 0.45, + "learning_rate": 1.9721552630768407e-06, + "loss": 0.1766, + "step": 1250, + "task_loss": 0.5843634605407715 + }, + { + "epoch": 0.45, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.93850085654329, + "step": 1250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14995011687278748, + "epoch": 0.46, + "learning_rate": 1.9717100861605196e-06, + "loss": 0.1696, + "step": 1260, + "task_loss": 0.7748905420303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10256899893283844, + "epoch": 0.46, + "learning_rate": 1.971261429754882e-06, + "loss": 0.1758, + "step": 1270, + "task_loss": 0.14379727840423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13538025319576263, + "epoch": 0.46, + "learning_rate": 1.970809295466472e-06, + "loss": 0.1922, + "step": 1280, + "task_loss": 0.4599601924419403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1672298014163971, + "epoch": 0.47, + "learning_rate": 1.9703536849142864e-06, + "loss": 0.1801, + "step": 1290, + "task_loss": 0.364857017993927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13718703389167786, + "epoch": 0.47, + "learning_rate": 1.9698945997297722e-06, + "loss": 0.1898, + "step": 1300, + "task_loss": 0.32751551270484924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15767154097557068, + "epoch": 0.47, + "learning_rate": 1.969432041556816e-06, + "loss": 0.191, + "step": 1310, + "task_loss": 0.5771661400794983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16633427143096924, + "epoch": 0.48, + "learning_rate": 1.968966012051741e-06, + "loss": 0.1904, + "step": 1320, + "task_loss": 0.20087338984012604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18319915235042572, + "epoch": 0.48, + "learning_rate": 1.9684965128833016e-06, + "loss": 0.1653, + "step": 1330, + "task_loss": 0.3794475197792053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14207670092582703, + "epoch": 0.48, + "learning_rate": 1.968023545732675e-06, + "loss": 0.1643, + "step": 1340, + "task_loss": 0.44781196117401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18061572313308716, + "epoch": 0.49, + "learning_rate": 1.967547112293457e-06, + "loss": 0.1737, + "step": 1350, + "task_loss": 0.39925694465637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12682190537452698, + "epoch": 0.49, + "learning_rate": 1.967067214271656e-06, + "loss": 0.1698, + "step": 1360, + "task_loss": 0.3139961063861847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17150625586509705, + "epoch": 0.5, + "learning_rate": 1.966583853385685e-06, + "loss": 0.1768, + "step": 1370, + "task_loss": 0.6008814573287964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1599908322095871, + "epoch": 0.5, + "learning_rate": 1.9660970313663583e-06, + "loss": 0.171, + "step": 1380, + "task_loss": 0.4636422395706177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14392361044883728, + "epoch": 0.5, + "learning_rate": 1.9656067499568826e-06, + "loss": 0.1899, + "step": 1390, + "task_loss": 0.3466000258922577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15924058854579926, + "epoch": 0.51, + "learning_rate": 1.965113010912853e-06, + "loss": 0.1803, + "step": 1400, + "task_loss": 0.5084146857261658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16537730395793915, + "epoch": 0.51, + "learning_rate": 1.964615816002244e-06, + "loss": 0.1729, + "step": 1410, + "task_loss": 0.5999622344970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16629469394683838, + "epoch": 0.51, + "learning_rate": 1.9641151670054075e-06, + "loss": 0.1845, + "step": 1420, + "task_loss": 0.32429054379463196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16834649443626404, + "epoch": 0.52, + "learning_rate": 1.963611065715061e-06, + "loss": 0.1705, + "step": 1430, + "task_loss": 0.21305644512176514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1554267406463623, + "epoch": 0.52, + "learning_rate": 1.963103513936286e-06, + "loss": 0.1904, + "step": 1440, + "task_loss": 0.22570659220218658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20235656201839447, + "epoch": 0.52, + "learning_rate": 1.9625925134865174e-06, + "loss": 0.1898, + "step": 1450, + "task_loss": 0.3236815333366394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15887069702148438, + "epoch": 0.53, + "learning_rate": 1.9620780661955414e-06, + "loss": 0.1798, + "step": 1460, + "task_loss": 0.6010942459106445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15550808608531952, + "epoch": 0.53, + "learning_rate": 1.961560173905485e-06, + "loss": 0.1817, + "step": 1470, + "task_loss": 0.18300172686576843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1984330117702484, + "epoch": 0.53, + "learning_rate": 1.961038838470812e-06, + "loss": 0.1697, + "step": 1480, + "task_loss": 0.5193182826042175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15731069445610046, + "epoch": 0.54, + "learning_rate": 1.9605140617583136e-06, + "loss": 0.1775, + "step": 1490, + "task_loss": 0.6436678171157837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18328088521957397, + "epoch": 0.54, + "learning_rate": 1.959985845647106e-06, + "loss": 0.1778, + "step": 1500, + "task_loss": 0.4055144190788269 + }, + { + "epoch": 0.54, + "eval_exact_match": 83.79375591296122, + "eval_f1": 90.07183397891889, + "step": 1500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.28984278440475464, + "epoch": 0.55, + "learning_rate": 1.95945419202862e-06, + "loss": 0.1925, + "step": 1510, + "task_loss": 0.5853183269500732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15279538929462433, + "epoch": 0.55, + "learning_rate": 1.9589191028065944e-06, + "loss": 0.1724, + "step": 1520, + "task_loss": 0.1796664148569107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1945275217294693, + "epoch": 0.55, + "learning_rate": 1.958380579897072e-06, + "loss": 0.1913, + "step": 1530, + "task_loss": 1.093052625656128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19740872085094452, + "epoch": 0.56, + "learning_rate": 1.9578386252283893e-06, + "loss": 0.1837, + "step": 1540, + "task_loss": 0.2792867422103882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16871120035648346, + "epoch": 0.56, + "learning_rate": 1.9572932407411715e-06, + "loss": 0.1797, + "step": 1550, + "task_loss": 0.3847885727882385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15976202487945557, + "epoch": 0.56, + "learning_rate": 1.9567444283883274e-06, + "loss": 0.1712, + "step": 1560, + "task_loss": 0.4087778925895691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17347773909568787, + "epoch": 0.57, + "learning_rate": 1.956192190135037e-06, + "loss": 0.1796, + "step": 1570, + "task_loss": 0.6276683211326599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1492072194814682, + "epoch": 0.57, + "learning_rate": 1.95563652795875e-06, + "loss": 0.188, + "step": 1580, + "task_loss": 0.406479150056839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15427610278129578, + "epoch": 0.57, + "learning_rate": 1.955077443849175e-06, + "loss": 0.1677, + "step": 1590, + "task_loss": 0.21835781633853912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17291224002838135, + "epoch": 0.58, + "learning_rate": 1.954514939808275e-06, + "loss": 0.1772, + "step": 1600, + "task_loss": 0.5329791903495789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17131741344928741, + "epoch": 0.58, + "learning_rate": 1.9539490178502587e-06, + "loss": 0.1668, + "step": 1610, + "task_loss": 0.42593175172805786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1412690430879593, + "epoch": 0.59, + "learning_rate": 1.9533796800015736e-06, + "loss": 0.188, + "step": 1620, + "task_loss": 0.4295274317264557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1931207925081253, + "epoch": 0.59, + "learning_rate": 1.952806928300898e-06, + "loss": 0.1789, + "step": 1630, + "task_loss": 0.33001917600631714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15779228508472443, + "epoch": 0.59, + "learning_rate": 1.9522307647991365e-06, + "loss": 0.1735, + "step": 1640, + "task_loss": 0.35899198055267334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17678138613700867, + "epoch": 0.6, + "learning_rate": 1.951651191559408e-06, + "loss": 0.173, + "step": 1650, + "task_loss": 0.30126041173934937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14263349771499634, + "epoch": 0.6, + "learning_rate": 1.951068210657043e-06, + "loss": 0.1911, + "step": 1660, + "task_loss": 0.3900689482688904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14301198720932007, + "epoch": 0.6, + "learning_rate": 1.9504818241795735e-06, + "loss": 0.1635, + "step": 1670, + "task_loss": 0.27476924657821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1587420105934143, + "epoch": 0.61, + "learning_rate": 1.9498920342267256e-06, + "loss": 0.177, + "step": 1680, + "task_loss": 0.5345589518547058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18046513199806213, + "epoch": 0.61, + "learning_rate": 1.949298842910413e-06, + "loss": 0.1811, + "step": 1690, + "task_loss": 0.5509105324745178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11487637460231781, + "epoch": 0.61, + "learning_rate": 1.9487022523547296e-06, + "loss": 0.1728, + "step": 1700, + "task_loss": 0.3565746545791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1578724980354309, + "epoch": 0.62, + "learning_rate": 1.9481022646959403e-06, + "loss": 0.1859, + "step": 1710, + "task_loss": 0.42493292689323425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2012399435043335, + "epoch": 0.62, + "learning_rate": 1.9474988820824743e-06, + "loss": 0.1704, + "step": 1720, + "task_loss": 0.4968900978565216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1466311812400818, + "epoch": 0.63, + "learning_rate": 1.946892106674918e-06, + "loss": 0.1922, + "step": 1730, + "task_loss": 0.3440721035003662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18941572308540344, + "epoch": 0.63, + "learning_rate": 1.9462819406460066e-06, + "loss": 0.1898, + "step": 1740, + "task_loss": 0.7371132373809814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14826852083206177, + "epoch": 0.63, + "learning_rate": 1.945668386180616e-06, + "loss": 0.1803, + "step": 1750, + "task_loss": 0.34923413395881653 + }, + { + "epoch": 0.63, + "eval_exact_match": 83.75591296121098, + "eval_f1": 90.08127134672708, + "step": 1750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12791767716407776, + "epoch": 0.64, + "learning_rate": 1.9450514454757557e-06, + "loss": 0.1798, + "step": 1760, + "task_loss": 0.26741665601730347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20207276940345764, + "epoch": 0.64, + "learning_rate": 1.9444311207405607e-06, + "loss": 0.1768, + "step": 1770, + "task_loss": 0.4358224868774414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14697374403476715, + "epoch": 0.64, + "learning_rate": 1.943807414196283e-06, + "loss": 0.1722, + "step": 1780, + "task_loss": 0.4396517276763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14546442031860352, + "epoch": 0.65, + "learning_rate": 1.9431803280762847e-06, + "loss": 0.1789, + "step": 1790, + "task_loss": 0.4033772945404053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18037843704223633, + "epoch": 0.65, + "learning_rate": 1.942549864626029e-06, + "loss": 0.1731, + "step": 1800, + "task_loss": 0.17874035239219666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1852273941040039, + "epoch": 0.65, + "learning_rate": 1.9419160261030732e-06, + "loss": 0.1759, + "step": 1810, + "task_loss": 0.7803164720535278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18808463215827942, + "epoch": 0.66, + "learning_rate": 1.941278814777059e-06, + "loss": 0.1729, + "step": 1820, + "task_loss": 0.2709602415561676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15545278787612915, + "epoch": 0.66, + "learning_rate": 1.940638232929707e-06, + "loss": 0.1833, + "step": 1830, + "task_loss": 0.5003842115402222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1838669627904892, + "epoch": 0.66, + "learning_rate": 1.939994282854805e-06, + "loss": 0.1838, + "step": 1840, + "task_loss": 0.5018002986907959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1540856808423996, + "epoch": 0.67, + "learning_rate": 1.9393469668582037e-06, + "loss": 0.188, + "step": 1850, + "task_loss": 0.4272739291191101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18242205679416656, + "epoch": 0.67, + "learning_rate": 1.9386962872578046e-06, + "loss": 0.1845, + "step": 1860, + "task_loss": 0.6574127674102783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16230648756027222, + "epoch": 0.68, + "learning_rate": 1.938042246383555e-06, + "loss": 0.179, + "step": 1870, + "task_loss": 0.3625655174255371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16737070679664612, + "epoch": 0.68, + "learning_rate": 1.9373848465774373e-06, + "loss": 0.1707, + "step": 1880, + "task_loss": 0.3662741184234619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14946278929710388, + "epoch": 0.68, + "learning_rate": 1.936724090193462e-06, + "loss": 0.1815, + "step": 1890, + "task_loss": 0.4266752600669861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14057648181915283, + "epoch": 0.69, + "learning_rate": 1.936059979597658e-06, + "loss": 0.1713, + "step": 1900, + "task_loss": 0.19198694825172424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13985538482666016, + "epoch": 0.69, + "learning_rate": 1.9353925171680666e-06, + "loss": 0.1662, + "step": 1910, + "task_loss": 0.3392411172389984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16503742337226868, + "epoch": 0.69, + "learning_rate": 1.93472170529473e-06, + "loss": 0.1854, + "step": 1920, + "task_loss": 0.4105537533760071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16417258977890015, + "epoch": 0.7, + "learning_rate": 1.9340475463796833e-06, + "loss": 0.1803, + "step": 1930, + "task_loss": 0.3279024660587311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1898116171360016, + "epoch": 0.7, + "learning_rate": 1.9333700428369494e-06, + "loss": 0.1849, + "step": 1940, + "task_loss": 0.6425855755805969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15912674367427826, + "epoch": 0.7, + "learning_rate": 1.9326891970925246e-06, + "loss": 0.1801, + "step": 1950, + "task_loss": 0.3348599970340729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22277340292930603, + "epoch": 0.71, + "learning_rate": 1.9320050115843748e-06, + "loss": 0.1904, + "step": 1960, + "task_loss": 0.525826096534729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19547489285469055, + "epoch": 0.71, + "learning_rate": 1.9313174887624245e-06, + "loss": 0.1759, + "step": 1970, + "task_loss": 0.47217780351638794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16027839481830597, + "epoch": 0.72, + "learning_rate": 1.930626631088548e-06, + "loss": 0.1846, + "step": 1980, + "task_loss": 0.3707219362258911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2344164252281189, + "epoch": 0.72, + "learning_rate": 1.9299324410365607e-06, + "loss": 0.1822, + "step": 1990, + "task_loss": 0.44019412994384766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1779266893863678, + "epoch": 0.72, + "learning_rate": 1.9292349210922114e-06, + "loss": 0.1815, + "step": 2000, + "task_loss": 0.2988145351409912 + }, + { + "epoch": 0.72, + "eval_exact_match": 83.75591296121098, + "eval_f1": 90.10693629002859, + "step": 2000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19847017526626587, + "epoch": 0.73, + "learning_rate": 1.928534073753173e-06, + "loss": 0.1785, + "step": 2010, + "task_loss": 0.42653709650039673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15378426015377045, + "epoch": 0.73, + "learning_rate": 1.9278299015290313e-06, + "loss": 0.1681, + "step": 2020, + "task_loss": 0.3308219909667969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16970130801200867, + "epoch": 0.73, + "learning_rate": 1.9271224069412792e-06, + "loss": 0.1797, + "step": 2030, + "task_loss": 0.5738540887832642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1360698640346527, + "epoch": 0.74, + "learning_rate": 1.9264115925233063e-06, + "loss": 0.1678, + "step": 2040, + "task_loss": 0.2731480300426483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14180555939674377, + "epoch": 0.74, + "learning_rate": 1.925697460820389e-06, + "loss": 0.1714, + "step": 2050, + "task_loss": 0.4451160132884979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16470122337341309, + "epoch": 0.74, + "learning_rate": 1.9249800143896825e-06, + "loss": 0.1865, + "step": 2060, + "task_loss": 0.9461302161216736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13804659247398376, + "epoch": 0.75, + "learning_rate": 1.9242592558002116e-06, + "loss": 0.1804, + "step": 2070, + "task_loss": 0.5417971611022949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20538190007209778, + "epoch": 0.75, + "learning_rate": 1.9235351876328612e-06, + "loss": 0.1716, + "step": 2080, + "task_loss": 0.468280553817749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17718102037906647, + "epoch": 0.76, + "learning_rate": 1.9228078124803676e-06, + "loss": 0.1694, + "step": 2090, + "task_loss": 0.4932321608066559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1479235589504242, + "epoch": 0.76, + "learning_rate": 1.922077132947307e-06, + "loss": 0.1831, + "step": 2100, + "task_loss": 0.3168169856071472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22004127502441406, + "epoch": 0.76, + "learning_rate": 1.9213431516500902e-06, + "loss": 0.1788, + "step": 2110, + "task_loss": 0.5075056552886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16012755036354065, + "epoch": 0.77, + "learning_rate": 1.920605871216949e-06, + "loss": 0.1779, + "step": 2120, + "task_loss": 0.4837586581707001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15686213970184326, + "epoch": 0.77, + "learning_rate": 1.919865294287929e-06, + "loss": 0.1832, + "step": 2130, + "task_loss": 0.3587515950202942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15825381875038147, + "epoch": 0.77, + "learning_rate": 1.919121423514882e-06, + "loss": 0.1663, + "step": 2140, + "task_loss": 0.27808576822280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16749948263168335, + "epoch": 0.78, + "learning_rate": 1.918374261561451e-06, + "loss": 0.1727, + "step": 2150, + "task_loss": 0.3993534445762634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14597871899604797, + "epoch": 0.78, + "learning_rate": 1.9176238111030663e-06, + "loss": 0.1674, + "step": 2160, + "task_loss": 0.2307223528623581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15081357955932617, + "epoch": 0.78, + "learning_rate": 1.9168700748269336e-06, + "loss": 0.1965, + "step": 2170, + "task_loss": 0.2585129141807556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16626910865306854, + "epoch": 0.79, + "learning_rate": 1.916113055432023e-06, + "loss": 0.1749, + "step": 2180, + "task_loss": 0.3819560408592224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.161339670419693, + "epoch": 0.79, + "learning_rate": 1.915352755629062e-06, + "loss": 0.1702, + "step": 2190, + "task_loss": 0.4928886592388153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15861815214157104, + "epoch": 0.8, + "learning_rate": 1.9145891781405242e-06, + "loss": 0.1737, + "step": 2200, + "task_loss": 0.4635201096534729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16528362035751343, + "epoch": 0.8, + "learning_rate": 1.91382232570062e-06, + "loss": 0.1802, + "step": 2210, + "task_loss": 0.5430650115013123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13787183165550232, + "epoch": 0.8, + "learning_rate": 1.9130522010552868e-06, + "loss": 0.1701, + "step": 2220, + "task_loss": 0.24549484252929688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13275370001792908, + "epoch": 0.81, + "learning_rate": 1.9122788069621785e-06, + "loss": 0.1586, + "step": 2230, + "task_loss": 0.3313102722167969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21796873211860657, + "epoch": 0.81, + "learning_rate": 1.9115021461906563e-06, + "loss": 0.1696, + "step": 2240, + "task_loss": 0.6265380382537842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1380050778388977, + "epoch": 0.81, + "learning_rate": 1.9107222215217797e-06, + "loss": 0.1662, + "step": 2250, + "task_loss": 0.4539833068847656 + }, + { + "epoch": 0.81, + "eval_exact_match": 83.8221381267739, + "eval_f1": 90.09116234913313, + "step": 2250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1531752347946167, + "epoch": 0.82, + "learning_rate": 1.9099390357482943e-06, + "loss": 0.1697, + "step": 2260, + "task_loss": 0.1812114119529724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19936254620552063, + "epoch": 0.82, + "learning_rate": 1.9091525916746236e-06, + "loss": 0.1869, + "step": 2270, + "task_loss": 0.382361501455307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18323323130607605, + "epoch": 0.82, + "learning_rate": 1.9083628921168582e-06, + "loss": 0.174, + "step": 2280, + "task_loss": 0.30770325660705566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1802387237548828, + "epoch": 0.83, + "learning_rate": 1.9075699399027466e-06, + "loss": 0.1861, + "step": 2290, + "task_loss": 0.521061897277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18234525620937347, + "epoch": 0.83, + "learning_rate": 1.9067737378716833e-06, + "loss": 0.2008, + "step": 2300, + "task_loss": 0.2811727225780487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16019967198371887, + "epoch": 0.83, + "learning_rate": 1.9059742888747002e-06, + "loss": 0.1948, + "step": 2310, + "task_loss": 0.4444176256656647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15703542530536652, + "epoch": 0.84, + "learning_rate": 1.9051715957744562e-06, + "loss": 0.1685, + "step": 2320, + "task_loss": 0.3662908673286438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14924360811710358, + "epoch": 0.84, + "learning_rate": 1.9043656614452257e-06, + "loss": 0.18, + "step": 2330, + "task_loss": 0.32921046018600464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15606483817100525, + "epoch": 0.85, + "learning_rate": 1.9035564887728907e-06, + "loss": 0.1725, + "step": 2340, + "task_loss": 0.5169017910957336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16825850307941437, + "epoch": 0.85, + "learning_rate": 1.902744080654928e-06, + "loss": 0.1805, + "step": 2350, + "task_loss": 0.493346244096756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22923272848129272, + "epoch": 0.85, + "learning_rate": 1.9019284400003998e-06, + "loss": 0.1885, + "step": 2360, + "task_loss": 0.4266737997531891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18409624695777893, + "epoch": 0.86, + "learning_rate": 1.901109569729944e-06, + "loss": 0.1706, + "step": 2370, + "task_loss": 0.2116602510213852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1556832194328308, + "epoch": 0.86, + "learning_rate": 1.9002874727757627e-06, + "loss": 0.1654, + "step": 2380, + "task_loss": 0.324906587600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21795554459095, + "epoch": 0.86, + "learning_rate": 1.899462152081612e-06, + "loss": 0.1917, + "step": 2390, + "task_loss": 0.5224672555923462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1452036201953888, + "epoch": 0.87, + "learning_rate": 1.898633610602791e-06, + "loss": 0.1783, + "step": 2400, + "task_loss": 0.29410141706466675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14028596878051758, + "epoch": 0.87, + "learning_rate": 1.8978018513061333e-06, + "loss": 0.1796, + "step": 2410, + "task_loss": 0.22278541326522827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13686451315879822, + "epoch": 0.87, + "learning_rate": 1.8969668771699936e-06, + "loss": 0.1592, + "step": 2420, + "task_loss": 0.2901807129383087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17762663960456848, + "epoch": 0.88, + "learning_rate": 1.8961286911842385e-06, + "loss": 0.18, + "step": 2430, + "task_loss": 0.6580682992935181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15839552879333496, + "epoch": 0.88, + "learning_rate": 1.8952872963502354e-06, + "loss": 0.1748, + "step": 2440, + "task_loss": 0.33616119623184204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1614687591791153, + "epoch": 0.89, + "learning_rate": 1.8944426956808423e-06, + "loss": 0.1668, + "step": 2450, + "task_loss": 0.3434739112854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1861266791820526, + "epoch": 0.89, + "learning_rate": 1.8935948922003964e-06, + "loss": 0.1747, + "step": 2460, + "task_loss": 0.5159826278686523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19838041067123413, + "epoch": 0.89, + "learning_rate": 1.8927438889447037e-06, + "loss": 0.1775, + "step": 2470, + "task_loss": 0.4625704288482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14603759348392487, + "epoch": 0.9, + "learning_rate": 1.8918896889610276e-06, + "loss": 0.1915, + "step": 2480, + "task_loss": 0.7576199769973755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16346848011016846, + "epoch": 0.9, + "learning_rate": 1.8910322953080787e-06, + "loss": 0.1776, + "step": 2490, + "task_loss": 0.4902191162109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14848199486732483, + "epoch": 0.9, + "learning_rate": 1.890171711056003e-06, + "loss": 0.1751, + "step": 2500, + "task_loss": 0.4773310720920563 + }, + { + "epoch": 0.9, + "eval_exact_match": 83.52885525070955, + "eval_f1": 90.03965181607728, + "step": 2500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1388649195432663, + "epoch": 0.91, + "learning_rate": 1.8893079392863714e-06, + "loss": 0.1646, + "step": 2510, + "task_loss": 0.3831629157066345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13925893604755402, + "epoch": 0.91, + "learning_rate": 1.8884409830921692e-06, + "loss": 0.1754, + "step": 2520, + "task_loss": 0.4401072859764099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20444272458553314, + "epoch": 0.91, + "learning_rate": 1.887570845577784e-06, + "loss": 0.1794, + "step": 2530, + "task_loss": 0.545073390007019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15069428086280823, + "epoch": 0.92, + "learning_rate": 1.8866975298589949e-06, + "loss": 0.183, + "step": 2540, + "task_loss": 0.3420984745025635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1433373987674713, + "epoch": 0.92, + "learning_rate": 1.885821039062962e-06, + "loss": 0.1709, + "step": 2550, + "task_loss": 0.3048064112663269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14648103713989258, + "epoch": 0.93, + "learning_rate": 1.8849413763282144e-06, + "loss": 0.1836, + "step": 2560, + "task_loss": 0.4393615424633026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1583467423915863, + "epoch": 0.93, + "learning_rate": 1.8840585448046386e-06, + "loss": 0.1746, + "step": 2570, + "task_loss": 0.5558731555938721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1908012479543686, + "epoch": 0.93, + "learning_rate": 1.8831725476534693e-06, + "loss": 0.1818, + "step": 2580, + "task_loss": 0.3851405382156372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15493538975715637, + "epoch": 0.94, + "learning_rate": 1.882283388047275e-06, + "loss": 0.1796, + "step": 2590, + "task_loss": 0.4332561492919922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18247196078300476, + "epoch": 0.94, + "learning_rate": 1.88139106916995e-06, + "loss": 0.1842, + "step": 2600, + "task_loss": 0.3662012219429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16121315956115723, + "epoch": 0.94, + "learning_rate": 1.8804955942167e-06, + "loss": 0.1717, + "step": 2610, + "task_loss": 0.7887633442878723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13026580214500427, + "epoch": 0.95, + "learning_rate": 1.879596966394032e-06, + "loss": 0.1763, + "step": 2620, + "task_loss": 0.3799874186515808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1766907274723053, + "epoch": 0.95, + "learning_rate": 1.8786951889197438e-06, + "loss": 0.178, + "step": 2630, + "task_loss": 0.4739248752593994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1579083502292633, + "epoch": 0.95, + "learning_rate": 1.8777902650229103e-06, + "loss": 0.1818, + "step": 2640, + "task_loss": 0.5713690519332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19626328349113464, + "epoch": 0.96, + "learning_rate": 1.8768821979438739e-06, + "loss": 0.1771, + "step": 2650, + "task_loss": 0.3688851594924927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15492022037506104, + "epoch": 0.96, + "learning_rate": 1.875970990934231e-06, + "loss": 0.1747, + "step": 2660, + "task_loss": 0.36857369542121887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21618108451366425, + "epoch": 0.96, + "learning_rate": 1.875056647256823e-06, + "loss": 0.1856, + "step": 2670, + "task_loss": 0.5106911659240723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13856954872608185, + "epoch": 0.97, + "learning_rate": 1.8741391701857215e-06, + "loss": 0.1816, + "step": 2680, + "task_loss": 0.3294844627380371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14490175247192383, + "epoch": 0.97, + "learning_rate": 1.873218563006219e-06, + "loss": 0.1729, + "step": 2690, + "task_loss": 0.3456054627895355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.142696350812912, + "epoch": 0.98, + "learning_rate": 1.8722948290148161e-06, + "loss": 0.1744, + "step": 2700, + "task_loss": 0.3695463538169861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2125137597322464, + "epoch": 0.98, + "learning_rate": 1.8713679715192102e-06, + "loss": 0.1904, + "step": 2710, + "task_loss": 0.5505295991897583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1854153871536255, + "epoch": 0.98, + "learning_rate": 1.8704379938382822e-06, + "loss": 0.1877, + "step": 2720, + "task_loss": 0.4623679220676422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1481424868106842, + "epoch": 0.99, + "learning_rate": 1.869504899302087e-06, + "loss": 0.1965, + "step": 2730, + "task_loss": 0.26727068424224854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16843540966510773, + "epoch": 0.99, + "learning_rate": 1.8685686912518394e-06, + "loss": 0.1715, + "step": 2740, + "task_loss": 0.4420323967933655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15341581404209137, + "epoch": 0.99, + "learning_rate": 1.8676293730399038e-06, + "loss": 0.1783, + "step": 2750, + "task_loss": 0.701764702796936 + }, + { + "epoch": 0.99, + "eval_exact_match": 83.72753074739829, + "eval_f1": 90.08435171358782, + "step": 2750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19545786082744598, + "epoch": 1.0, + "learning_rate": 1.8666869480297808e-06, + "loss": 0.1802, + "step": 2760, + "task_loss": 0.30919766426086426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15790414810180664, + "epoch": 1.0, + "learning_rate": 1.8657414195960958e-06, + "loss": 0.164, + "step": 2770, + "task_loss": 0.6185303926467896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18240132927894592, + "epoch": 1.0, + "learning_rate": 1.8647927911245875e-06, + "loss": 0.1829, + "step": 2780, + "task_loss": 0.4996330142021179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17760756611824036, + "epoch": 1.01, + "learning_rate": 1.8638410660120947e-06, + "loss": 0.1736, + "step": 2790, + "task_loss": 0.34335148334503174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14696209132671356, + "epoch": 1.01, + "learning_rate": 1.8628862476665448e-06, + "loss": 0.1828, + "step": 2800, + "task_loss": 0.42722922563552856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15718209743499756, + "epoch": 1.02, + "learning_rate": 1.8619283395069409e-06, + "loss": 0.1764, + "step": 2810, + "task_loss": 0.35283511877059937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15315774083137512, + "epoch": 1.02, + "learning_rate": 1.8609673449633513e-06, + "loss": 0.1858, + "step": 2820, + "task_loss": 0.38201355934143066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1411234736442566, + "epoch": 1.02, + "learning_rate": 1.8600032674768947e-06, + "loss": 0.1745, + "step": 2830, + "task_loss": 0.5564037561416626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14134261012077332, + "epoch": 1.03, + "learning_rate": 1.8590361104997298e-06, + "loss": 0.1873, + "step": 2840, + "task_loss": 0.28306859731674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15581990778446198, + "epoch": 1.03, + "learning_rate": 1.858065877495042e-06, + "loss": 0.1664, + "step": 2850, + "task_loss": 0.5072811245918274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1485014408826828, + "epoch": 1.03, + "learning_rate": 1.857092571937032e-06, + "loss": 0.1805, + "step": 2860, + "task_loss": 0.48709845542907715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16704809665679932, + "epoch": 1.04, + "learning_rate": 1.8561161973109014e-06, + "loss": 0.1867, + "step": 2870, + "task_loss": 0.6069858074188232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.148666113615036, + "epoch": 1.04, + "learning_rate": 1.8551367571128429e-06, + "loss": 0.1896, + "step": 2880, + "task_loss": 0.3557353913784027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16260361671447754, + "epoch": 1.04, + "learning_rate": 1.8541542548500256e-06, + "loss": 0.1756, + "step": 2890, + "task_loss": 0.416062593460083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1194634884595871, + "epoch": 1.05, + "learning_rate": 1.853168694040583e-06, + "loss": 0.1718, + "step": 2900, + "task_loss": 0.17556090652942657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1434130072593689, + "epoch": 1.05, + "learning_rate": 1.8521800782136014e-06, + "loss": 0.1679, + "step": 2910, + "task_loss": 0.4673941135406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21220733225345612, + "epoch": 1.06, + "learning_rate": 1.851188410909106e-06, + "loss": 0.1763, + "step": 2920, + "task_loss": 0.562555730342865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1408117711544037, + "epoch": 1.06, + "learning_rate": 1.850193695678048e-06, + "loss": 0.1688, + "step": 2930, + "task_loss": 0.3450365662574768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15347136557102203, + "epoch": 1.06, + "learning_rate": 1.8491959360822938e-06, + "loss": 0.1821, + "step": 2940, + "task_loss": 0.5536178350448608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18112389743328094, + "epoch": 1.07, + "learning_rate": 1.848195135694611e-06, + "loss": 0.182, + "step": 2950, + "task_loss": 0.5740102529525757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15279173851013184, + "epoch": 1.07, + "learning_rate": 1.8471912980986544e-06, + "loss": 0.1787, + "step": 2960, + "task_loss": 0.16615566611289978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16220858693122864, + "epoch": 1.07, + "learning_rate": 1.846285250419875e-06, + "loss": 0.1767, + "step": 2970, + "task_loss": 0.47266218066215515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18561126291751862, + "epoch": 1.08, + "learning_rate": 1.8452756520401107e-06, + "loss": 0.1785, + "step": 2980, + "task_loss": 0.4946807622909546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14157789945602417, + "epoch": 1.08, + "learning_rate": 1.8442630269061292e-06, + "loss": 0.1778, + "step": 2990, + "task_loss": 0.34104907512664795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11815983057022095, + "epoch": 1.08, + "learning_rate": 1.8432473786439283e-06, + "loss": 0.1527, + "step": 3000, + "task_loss": 0.1873714029788971 + }, + { + "epoch": 1.08, + "eval_exact_match": 83.44370860927152, + "eval_f1": 89.90493527722583, + "step": 3000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16796885430812836, + "epoch": 1.09, + "learning_rate": 1.8422287108903304e-06, + "loss": 0.1745, + "step": 3010, + "task_loss": 0.2368674874305725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16812004148960114, + "epoch": 1.09, + "learning_rate": 1.841207027292971e-06, + "loss": 0.1866, + "step": 3020, + "task_loss": 0.6989672183990479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16153055429458618, + "epoch": 1.1, + "learning_rate": 1.8401823315102833e-06, + "loss": 0.178, + "step": 3030, + "task_loss": 0.43721848726272583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12805390357971191, + "epoch": 1.1, + "learning_rate": 1.8391546272114878e-06, + "loss": 0.1755, + "step": 3040, + "task_loss": 0.2398868352174759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1456378549337387, + "epoch": 1.1, + "learning_rate": 1.8381239180765768e-06, + "loss": 0.1684, + "step": 3050, + "task_loss": 0.5672136545181274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15030169486999512, + "epoch": 1.11, + "learning_rate": 1.837090207796303e-06, + "loss": 0.1841, + "step": 3060, + "task_loss": 0.4073163568973541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1696191281080246, + "epoch": 1.11, + "learning_rate": 1.8360535000721655e-06, + "loss": 0.1798, + "step": 3070, + "task_loss": 0.5377500057220459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18132445216178894, + "epoch": 1.11, + "learning_rate": 1.8350137986163965e-06, + "loss": 0.1859, + "step": 3080, + "task_loss": 0.5428669452667236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12620098888874054, + "epoch": 1.12, + "learning_rate": 1.8339711071519482e-06, + "loss": 0.1726, + "step": 3090, + "task_loss": 0.36496835947036743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12938401103019714, + "epoch": 1.12, + "learning_rate": 1.8329254294124787e-06, + "loss": 0.1684, + "step": 3100, + "task_loss": 0.3683048486709595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17861530184745789, + "epoch": 1.12, + "learning_rate": 1.8318767691423402e-06, + "loss": 0.1882, + "step": 3110, + "task_loss": 0.43564486503601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1419188380241394, + "epoch": 1.13, + "learning_rate": 1.830825130096565e-06, + "loss": 0.1769, + "step": 3120, + "task_loss": 0.5064725875854492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18351003527641296, + "epoch": 1.13, + "learning_rate": 1.8297705160408503e-06, + "loss": 0.1623, + "step": 3130, + "task_loss": 0.4635063409805298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1454399824142456, + "epoch": 1.13, + "learning_rate": 1.8287129307515478e-06, + "loss": 0.1687, + "step": 3140, + "task_loss": 0.3754516839981079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1441243588924408, + "epoch": 1.14, + "learning_rate": 1.8276523780156474e-06, + "loss": 0.1763, + "step": 3150, + "task_loss": 0.3571741580963135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15150251984596252, + "epoch": 1.14, + "learning_rate": 1.8265888616307657e-06, + "loss": 0.1837, + "step": 3160, + "task_loss": 0.34464773535728455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13545379042625427, + "epoch": 1.15, + "learning_rate": 1.8255223854051305e-06, + "loss": 0.1715, + "step": 3170, + "task_loss": 0.3557414412498474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1452762484550476, + "epoch": 1.15, + "learning_rate": 1.824452953157569e-06, + "loss": 0.1874, + "step": 3180, + "task_loss": 0.3559718728065491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14096008241176605, + "epoch": 1.15, + "learning_rate": 1.823380568717493e-06, + "loss": 0.1848, + "step": 3190, + "task_loss": 0.4971044063568115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2189292013645172, + "epoch": 1.16, + "learning_rate": 1.8223052359248854e-06, + "loss": 0.1742, + "step": 3200, + "task_loss": 0.3672422766685486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1771106719970703, + "epoch": 1.16, + "learning_rate": 1.821226958630287e-06, + "loss": 0.1686, + "step": 3210, + "task_loss": 0.43500882387161255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12642604112625122, + "epoch": 1.16, + "learning_rate": 1.8201457406947814e-06, + "loss": 0.166, + "step": 3220, + "task_loss": 0.1765107810497284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16896989941596985, + "epoch": 1.17, + "learning_rate": 1.8190615859899824e-06, + "loss": 0.1816, + "step": 3230, + "task_loss": 0.4253600835800171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18833866715431213, + "epoch": 1.17, + "learning_rate": 1.8179744983980206e-06, + "loss": 0.1822, + "step": 3240, + "task_loss": 0.48154908418655396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21106314659118652, + "epoch": 1.17, + "learning_rate": 1.816884481811527e-06, + "loss": 0.1856, + "step": 3250, + "task_loss": 0.4936971664428711 + }, + { + "epoch": 1.17, + "eval_exact_match": 83.6329233680227, + "eval_f1": 89.95397066155324, + "step": 3250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13803541660308838, + "epoch": 1.18, + "learning_rate": 1.8157915401336218e-06, + "loss": 0.1726, + "step": 3260, + "task_loss": 0.18990027904510498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19129733741283417, + "epoch": 1.18, + "learning_rate": 1.8148053949047202e-06, + "loss": 0.1913, + "step": 3270, + "task_loss": 0.3831537961959839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15445619821548462, + "epoch": 1.19, + "learning_rate": 1.8137069063437304e-06, + "loss": 0.1686, + "step": 3280, + "task_loss": 0.43404531478881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18276625871658325, + "epoch": 1.19, + "learning_rate": 1.8126055040695588e-06, + "loss": 0.1867, + "step": 3290, + "task_loss": 0.5082448720932007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16971507668495178, + "epoch": 1.19, + "learning_rate": 1.8115011920260946e-06, + "loss": 0.1896, + "step": 3300, + "task_loss": 0.4748695194721222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15451842546463013, + "epoch": 1.2, + "learning_rate": 1.8103939741676465e-06, + "loss": 0.1619, + "step": 3310, + "task_loss": 0.5286293029785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16968689858913422, + "epoch": 1.2, + "learning_rate": 1.8092838544589287e-06, + "loss": 0.1662, + "step": 3320, + "task_loss": 0.19997572898864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1290392279624939, + "epoch": 1.2, + "learning_rate": 1.8081708368750466e-06, + "loss": 0.1628, + "step": 3330, + "task_loss": 0.3563808798789978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12694013118743896, + "epoch": 1.21, + "learning_rate": 1.8070549254014816e-06, + "loss": 0.1722, + "step": 3340, + "task_loss": 0.3265552818775177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1317066103219986, + "epoch": 1.21, + "learning_rate": 1.8059361240340782e-06, + "loss": 0.1568, + "step": 3350, + "task_loss": 0.26479342579841614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14310331642627716, + "epoch": 1.21, + "learning_rate": 1.8048144367790284e-06, + "loss": 0.1919, + "step": 3360, + "task_loss": 0.3072636127471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15124750137329102, + "epoch": 1.22, + "learning_rate": 1.803689867652858e-06, + "loss": 0.1826, + "step": 3370, + "task_loss": 0.5191828608512878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15719471871852875, + "epoch": 1.22, + "learning_rate": 1.802562420682413e-06, + "loss": 0.1727, + "step": 3380, + "task_loss": 0.3152187466621399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1399240493774414, + "epoch": 1.23, + "learning_rate": 1.8014320999048426e-06, + "loss": 0.1718, + "step": 3390, + "task_loss": 0.2820362448692322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16205359995365143, + "epoch": 1.23, + "learning_rate": 1.800298909367589e-06, + "loss": 0.1843, + "step": 3400, + "task_loss": 0.36751991510391235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1521889865398407, + "epoch": 1.23, + "learning_rate": 1.799162853128368e-06, + "loss": 0.1557, + "step": 3410, + "task_loss": 0.3483598828315735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1441453993320465, + "epoch": 1.24, + "learning_rate": 1.7980239352551582e-06, + "loss": 0.1681, + "step": 3420, + "task_loss": 0.5732893943786621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1875881552696228, + "epoch": 1.24, + "learning_rate": 1.7968821598261852e-06, + "loss": 0.1703, + "step": 3430, + "task_loss": 0.37429314851760864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.23559153079986572, + "epoch": 1.24, + "learning_rate": 1.7957375309299058e-06, + "loss": 0.177, + "step": 3440, + "task_loss": 0.503436803817749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15299516916275024, + "epoch": 1.25, + "learning_rate": 1.7945900526649957e-06, + "loss": 0.1775, + "step": 3450, + "task_loss": 0.34383100271224976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11944176256656647, + "epoch": 1.25, + "learning_rate": 1.793439729140333e-06, + "loss": 0.1805, + "step": 3460, + "task_loss": 0.1734773814678192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1728697270154953, + "epoch": 1.25, + "learning_rate": 1.7922865644749843e-06, + "loss": 0.167, + "step": 3470, + "task_loss": 0.39709553122520447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1935918629169464, + "epoch": 1.26, + "learning_rate": 1.7911305627981892e-06, + "loss": 0.1799, + "step": 3480, + "task_loss": 0.4305168092250824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14809994399547577, + "epoch": 1.26, + "learning_rate": 1.7899717282493463e-06, + "loss": 0.1799, + "step": 3490, + "task_loss": 0.5099273920059204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15830612182617188, + "epoch": 1.26, + "learning_rate": 1.7888100649779986e-06, + "loss": 0.1738, + "step": 3500, + "task_loss": 0.38358789682388306 + }, + { + "epoch": 1.26, + "eval_exact_match": 83.50047303689688, + "eval_f1": 89.96018049046944, + "step": 3500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15528270602226257, + "epoch": 1.27, + "learning_rate": 1.7876455771438178e-06, + "loss": 0.1765, + "step": 3510, + "task_loss": 0.24267150461673737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14513492584228516, + "epoch": 1.27, + "learning_rate": 1.7864782689165901e-06, + "loss": 0.1703, + "step": 3520, + "task_loss": 0.551958441734314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13601109385490417, + "epoch": 1.28, + "learning_rate": 1.7853081444761998e-06, + "loss": 0.1683, + "step": 3530, + "task_loss": 0.2686072289943695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13815787434577942, + "epoch": 1.28, + "learning_rate": 1.7841352080126164e-06, + "loss": 0.1823, + "step": 3540, + "task_loss": 0.5631309747695923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18684223294258118, + "epoch": 1.28, + "learning_rate": 1.7829594637258792e-06, + "loss": 0.1765, + "step": 3550, + "task_loss": 0.40622183680534363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16199585795402527, + "epoch": 1.29, + "learning_rate": 1.7817809158260805e-06, + "loss": 0.1833, + "step": 3560, + "task_loss": 0.537696897983551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13808155059814453, + "epoch": 1.29, + "learning_rate": 1.7805995685333524e-06, + "loss": 0.1705, + "step": 3570, + "task_loss": 0.26295095682144165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21583780646324158, + "epoch": 1.29, + "learning_rate": 1.7794154260778507e-06, + "loss": 0.1735, + "step": 3580, + "task_loss": 0.4579695463180542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22117865085601807, + "epoch": 1.3, + "learning_rate": 1.778228492699741e-06, + "loss": 0.1788, + "step": 3590, + "task_loss": 0.6541140079498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17737066745758057, + "epoch": 1.3, + "learning_rate": 1.7770387726491812e-06, + "loss": 0.1802, + "step": 3600, + "task_loss": 0.3846840262413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18559321761131287, + "epoch": 1.3, + "learning_rate": 1.7758462701863084e-06, + "loss": 0.1833, + "step": 3610, + "task_loss": 0.43720221519470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15115341544151306, + "epoch": 1.31, + "learning_rate": 1.7746509895812238e-06, + "loss": 0.1638, + "step": 3620, + "task_loss": 0.44333165884017944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15191200375556946, + "epoch": 1.31, + "learning_rate": 1.773452935113975e-06, + "loss": 0.1797, + "step": 3630, + "task_loss": 0.5943065881729126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1316663920879364, + "epoch": 1.32, + "learning_rate": 1.7722521110745427e-06, + "loss": 0.1579, + "step": 3640, + "task_loss": 0.30335086584091187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14825601875782013, + "epoch": 1.32, + "learning_rate": 1.7710485217628262e-06, + "loss": 0.1763, + "step": 3650, + "task_loss": 0.2760236859321594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1533227413892746, + "epoch": 1.32, + "learning_rate": 1.7698421714886243e-06, + "loss": 0.1824, + "step": 3660, + "task_loss": 0.2671685218811035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16644388437271118, + "epoch": 1.33, + "learning_rate": 1.768633064571624e-06, + "loss": 0.1936, + "step": 3670, + "task_loss": 0.49052125215530396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18509739637374878, + "epoch": 1.33, + "learning_rate": 1.7674212053413822e-06, + "loss": 0.1649, + "step": 3680, + "task_loss": 0.2983229160308838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14358261227607727, + "epoch": 1.33, + "learning_rate": 1.7662065981373124e-06, + "loss": 0.1789, + "step": 3690, + "task_loss": 0.28044524788856506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19266170263290405, + "epoch": 1.34, + "learning_rate": 1.7649892473086674e-06, + "loss": 0.1825, + "step": 3700, + "task_loss": 0.2881600260734558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15574830770492554, + "epoch": 1.34, + "learning_rate": 1.763769157214524e-06, + "loss": 0.1734, + "step": 3710, + "task_loss": 0.33556947112083435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12162809073925018, + "epoch": 1.34, + "learning_rate": 1.7625463322237679e-06, + "loss": 0.1706, + "step": 3720, + "task_loss": 0.37808844447135925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18673118948936462, + "epoch": 1.35, + "learning_rate": 1.7613207767150783e-06, + "loss": 0.1778, + "step": 3730, + "task_loss": 0.536344587802887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12695255875587463, + "epoch": 1.35, + "learning_rate": 1.7600924950769117e-06, + "loss": 0.1722, + "step": 3740, + "task_loss": 0.6155173778533936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14430446922779083, + "epoch": 1.36, + "learning_rate": 1.758861491707486e-06, + "loss": 0.1741, + "step": 3750, + "task_loss": 0.35061925649642944 + }, + { + "epoch": 1.36, + "eval_exact_match": 83.4720908230842, + "eval_f1": 89.95138805784273, + "step": 3750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1242603063583374, + "epoch": 1.36, + "learning_rate": 1.757627771014765e-06, + "loss": 0.1656, + "step": 3760, + "task_loss": 0.42466431856155396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1508714258670807, + "epoch": 1.36, + "learning_rate": 1.756391337416443e-06, + "loss": 0.1697, + "step": 3770, + "task_loss": 0.4327700436115265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1856732964515686, + "epoch": 1.37, + "learning_rate": 1.7551521953399286e-06, + "loss": 0.1926, + "step": 3780, + "task_loss": 0.5305187702178955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1641940474510193, + "epoch": 1.37, + "learning_rate": 1.7539103492223286e-06, + "loss": 0.164, + "step": 3790, + "task_loss": 0.3325423002243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15839411318302155, + "epoch": 1.37, + "learning_rate": 1.752665803510433e-06, + "loss": 0.1747, + "step": 3800, + "task_loss": 0.4586430788040161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12557630240917206, + "epoch": 1.38, + "learning_rate": 1.7514185626606972e-06, + "loss": 0.1589, + "step": 3810, + "task_loss": 0.3240758180618286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1498473584651947, + "epoch": 1.38, + "learning_rate": 1.7501686311392292e-06, + "loss": 0.1709, + "step": 3820, + "task_loss": 0.33482322096824646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14660826325416565, + "epoch": 1.38, + "learning_rate": 1.7489160134217702e-06, + "loss": 0.1773, + "step": 3830, + "task_loss": 0.3380383253097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17524400353431702, + "epoch": 1.39, + "learning_rate": 1.7476607139936807e-06, + "loss": 0.1801, + "step": 3840, + "task_loss": 0.5500714182853699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18646281957626343, + "epoch": 1.39, + "learning_rate": 1.746402737349924e-06, + "loss": 0.1829, + "step": 3850, + "task_loss": 0.4430808126926422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18419930338859558, + "epoch": 1.4, + "learning_rate": 1.7451420879950491e-06, + "loss": 0.1766, + "step": 3860, + "task_loss": 0.699094295501709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1590745896100998, + "epoch": 1.4, + "learning_rate": 1.7438787704431765e-06, + "loss": 0.1794, + "step": 3870, + "task_loss": 0.5042922496795654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16868017613887787, + "epoch": 1.4, + "learning_rate": 1.7426127892179805e-06, + "loss": 0.1618, + "step": 3880, + "task_loss": 0.6242218613624573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12290704250335693, + "epoch": 1.41, + "learning_rate": 1.7413441488526734e-06, + "loss": 0.1671, + "step": 3890, + "task_loss": 0.33300209045410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1805894672870636, + "epoch": 1.41, + "learning_rate": 1.7400728538899893e-06, + "loss": 0.1794, + "step": 3900, + "task_loss": 0.4865798056125641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13778156042099, + "epoch": 1.41, + "learning_rate": 1.7387989088821677e-06, + "loss": 0.1745, + "step": 3910, + "task_loss": 0.36040520668029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18057343363761902, + "epoch": 1.42, + "learning_rate": 1.7375223183909378e-06, + "loss": 0.1818, + "step": 3920, + "task_loss": 0.43555018305778503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16372817754745483, + "epoch": 1.42, + "learning_rate": 1.7362430869875017e-06, + "loss": 0.1622, + "step": 3930, + "task_loss": 0.3211020827293396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18226927518844604, + "epoch": 1.42, + "learning_rate": 1.7349612192525176e-06, + "loss": 0.1748, + "step": 3940, + "task_loss": 0.39155834913253784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17144691944122314, + "epoch": 1.43, + "learning_rate": 1.7336767197760837e-06, + "loss": 0.1634, + "step": 3950, + "task_loss": 0.3620833158493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17456424236297607, + "epoch": 1.43, + "learning_rate": 1.7323895931577228e-06, + "loss": 0.1813, + "step": 3960, + "task_loss": 0.5210827589035034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1263851523399353, + "epoch": 1.43, + "learning_rate": 1.7310998440063647e-06, + "loss": 0.1715, + "step": 3970, + "task_loss": 0.5859290361404419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.129383385181427, + "epoch": 1.44, + "learning_rate": 1.7298074769403285e-06, + "loss": 0.1638, + "step": 3980, + "task_loss": 0.31362149119377136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.23324280977249146, + "epoch": 1.44, + "learning_rate": 1.72851249658731e-06, + "loss": 0.1804, + "step": 3990, + "task_loss": 0.451979398727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15351605415344238, + "epoch": 1.45, + "learning_rate": 1.727214907584361e-06, + "loss": 0.1647, + "step": 4000, + "task_loss": 0.33222532272338867 + }, + { + "epoch": 1.45, + "eval_exact_match": 83.66130558183538, + "eval_f1": 90.1062476420109, + "step": 4000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16170555353164673, + "epoch": 1.45, + "learning_rate": 1.725914714577874e-06, + "loss": 0.1619, + "step": 4010, + "task_loss": 0.5656744837760925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14653730392456055, + "epoch": 1.45, + "learning_rate": 1.724611922223567e-06, + "loss": 0.172, + "step": 4020, + "task_loss": 0.5113309621810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13656997680664062, + "epoch": 1.46, + "learning_rate": 1.7233065351864652e-06, + "loss": 0.171, + "step": 4030, + "task_loss": 0.24282409250736237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13946852087974548, + "epoch": 1.46, + "learning_rate": 1.7219985581408847e-06, + "loss": 0.1782, + "step": 4040, + "task_loss": 0.6446897983551025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.27610087394714355, + "epoch": 1.46, + "learning_rate": 1.7206879957704161e-06, + "loss": 0.1864, + "step": 4050, + "task_loss": 0.7376024127006531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20331989228725433, + "epoch": 1.47, + "learning_rate": 1.7193748527679074e-06, + "loss": 0.1816, + "step": 4060, + "task_loss": 0.6629816293716431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15988487005233765, + "epoch": 1.47, + "learning_rate": 1.7180591338354479e-06, + "loss": 0.1694, + "step": 4070, + "task_loss": 0.46234872937202454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1611352115869522, + "epoch": 1.47, + "learning_rate": 1.7167408436843493e-06, + "loss": 0.165, + "step": 4080, + "task_loss": 0.4623450040817261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15372850000858307, + "epoch": 1.48, + "learning_rate": 1.7154199870351319e-06, + "loss": 0.1701, + "step": 4090, + "task_loss": 0.3854554295539856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12263274192810059, + "epoch": 1.48, + "learning_rate": 1.7140965686175047e-06, + "loss": 0.1648, + "step": 4100, + "task_loss": 0.3795510530471802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14757771790027618, + "epoch": 1.49, + "learning_rate": 1.7127705931703511e-06, + "loss": 0.1677, + "step": 4110, + "task_loss": 0.5079241991043091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19403065741062164, + "epoch": 1.49, + "learning_rate": 1.7114420654417102e-06, + "loss": 0.1749, + "step": 4120, + "task_loss": 0.6132655143737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17154933512210846, + "epoch": 1.49, + "learning_rate": 1.7101109901887594e-06, + "loss": 0.1698, + "step": 4130, + "task_loss": 0.5632673501968384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15782007575035095, + "epoch": 1.5, + "learning_rate": 1.7087773721777998e-06, + "loss": 0.1682, + "step": 4140, + "task_loss": 0.29379528760910034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15191909670829773, + "epoch": 1.5, + "learning_rate": 1.7074412161842368e-06, + "loss": 0.1821, + "step": 4150, + "task_loss": 0.40577489137649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1590431034564972, + "epoch": 1.5, + "learning_rate": 1.7061025269925633e-06, + "loss": 0.1725, + "step": 4160, + "task_loss": 0.3673279285430908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15804147720336914, + "epoch": 1.51, + "learning_rate": 1.704761309396344e-06, + "loss": 0.1744, + "step": 4170, + "task_loss": 0.3593347668647766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22128893435001373, + "epoch": 1.51, + "learning_rate": 1.7034175681981969e-06, + "loss": 0.1745, + "step": 4180, + "task_loss": 0.3887554407119751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17814043164253235, + "epoch": 1.51, + "learning_rate": 1.702071308209776e-06, + "loss": 0.1842, + "step": 4190, + "task_loss": 0.37278926372528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1720452755689621, + "epoch": 1.52, + "learning_rate": 1.7007225342517554e-06, + "loss": 0.1618, + "step": 4200, + "task_loss": 0.3401908576488495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18010592460632324, + "epoch": 1.52, + "learning_rate": 1.6993712511538108e-06, + "loss": 0.1714, + "step": 4210, + "task_loss": 0.35429495573043823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14852438867092133, + "epoch": 1.53, + "learning_rate": 1.6980174637546022e-06, + "loss": 0.176, + "step": 4220, + "task_loss": 0.2848626971244812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12181121110916138, + "epoch": 1.53, + "learning_rate": 1.6966611769017574e-06, + "loss": 0.1731, + "step": 4230, + "task_loss": 0.234962597489357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17690081894397736, + "epoch": 1.53, + "learning_rate": 1.6953023954518546e-06, + "loss": 0.1816, + "step": 4240, + "task_loss": 0.3170431852340698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1582062691450119, + "epoch": 1.54, + "learning_rate": 1.6939411242704037e-06, + "loss": 0.1727, + "step": 4250, + "task_loss": 0.6310293674468994 + }, + { + "epoch": 1.54, + "eval_exact_match": 83.66130558183538, + "eval_f1": 90.01785833074021, + "step": 4250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13817504048347473, + "epoch": 1.54, + "learning_rate": 1.6925773682318312e-06, + "loss": 0.168, + "step": 4260, + "task_loss": 0.29341834783554077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18749260902404785, + "epoch": 1.54, + "learning_rate": 1.6912111322194594e-06, + "loss": 0.1685, + "step": 4270, + "task_loss": 0.5627535581588745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14412984251976013, + "epoch": 1.55, + "learning_rate": 1.6898424211254927e-06, + "loss": 0.1639, + "step": 4280, + "task_loss": 0.2700430452823639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15190498530864716, + "epoch": 1.55, + "learning_rate": 1.6884712398509966e-06, + "loss": 0.1656, + "step": 4290, + "task_loss": 0.2561272382736206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13519339263439178, + "epoch": 1.55, + "learning_rate": 1.6870975933058835e-06, + "loss": 0.1663, + "step": 4300, + "task_loss": 0.3266107439994812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15758942067623138, + "epoch": 1.56, + "learning_rate": 1.6857214864088927e-06, + "loss": 0.1749, + "step": 4310, + "task_loss": 0.4796781837940216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18209287524223328, + "epoch": 1.56, + "learning_rate": 1.6843429240875726e-06, + "loss": 0.1747, + "step": 4320, + "task_loss": 0.3897198438644409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16047507524490356, + "epoch": 1.56, + "learning_rate": 1.6829619112782654e-06, + "loss": 0.1644, + "step": 4330, + "task_loss": 0.4120858609676361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1207498162984848, + "epoch": 1.57, + "learning_rate": 1.6815784529260868e-06, + "loss": 0.1752, + "step": 4340, + "task_loss": 0.3987932801246643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14837431907653809, + "epoch": 1.57, + "learning_rate": 1.6801925539849102e-06, + "loss": 0.1675, + "step": 4350, + "task_loss": 0.4159763753414154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16814565658569336, + "epoch": 1.58, + "learning_rate": 1.6788042194173485e-06, + "loss": 0.1751, + "step": 4360, + "task_loss": 0.38205575942993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1617308259010315, + "epoch": 1.58, + "learning_rate": 1.6774134541947351e-06, + "loss": 0.1879, + "step": 4370, + "task_loss": 0.35478347539901733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16979068517684937, + "epoch": 1.58, + "learning_rate": 1.6760202632971074e-06, + "loss": 0.1883, + "step": 4380, + "task_loss": 0.4965069890022278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20151479542255402, + "epoch": 1.59, + "learning_rate": 1.6746246517131894e-06, + "loss": 0.1801, + "step": 4390, + "task_loss": 0.4965303838253021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1382327377796173, + "epoch": 1.59, + "learning_rate": 1.6732266244403722e-06, + "loss": 0.1728, + "step": 4400, + "task_loss": 0.41574716567993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17530761659145355, + "epoch": 1.59, + "learning_rate": 1.6718261864846968e-06, + "loss": 0.1828, + "step": 4410, + "task_loss": 0.518844485282898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15779836475849152, + "epoch": 1.6, + "learning_rate": 1.6704233428608376e-06, + "loss": 0.1705, + "step": 4420, + "task_loss": 0.3203689754009247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1792672872543335, + "epoch": 1.6, + "learning_rate": 1.6690180985920818e-06, + "loss": 0.1742, + "step": 4430, + "task_loss": 0.34858888387680054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15927091240882874, + "epoch": 1.6, + "learning_rate": 1.6676104587103137e-06, + "loss": 0.1661, + "step": 4440, + "task_loss": 0.42672163248062134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1978459358215332, + "epoch": 1.61, + "learning_rate": 1.666200428255995e-06, + "loss": 0.1701, + "step": 4450, + "task_loss": 0.42373642325401306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12445167452096939, + "epoch": 1.61, + "learning_rate": 1.6647880122781487e-06, + "loss": 0.1834, + "step": 4460, + "task_loss": 0.3997802138328552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20981666445732117, + "epoch": 1.62, + "learning_rate": 1.6633732158343386e-06, + "loss": 0.18, + "step": 4470, + "task_loss": 0.5796436667442322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19721180200576782, + "epoch": 1.62, + "learning_rate": 1.6619560439906533e-06, + "loss": 0.174, + "step": 4480, + "task_loss": 0.5064884424209595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11864525079727173, + "epoch": 1.62, + "learning_rate": 1.6605365018216867e-06, + "loss": 0.1624, + "step": 4490, + "task_loss": 0.33762532472610474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16903159022331238, + "epoch": 1.63, + "learning_rate": 1.659114594410521e-06, + "loss": 0.1668, + "step": 4500, + "task_loss": 0.3473363518714905 + }, + { + "epoch": 1.63, + "eval_exact_match": 83.71807000946073, + "eval_f1": 90.03319320561076, + "step": 4500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11900688707828522, + "epoch": 1.63, + "learning_rate": 1.6576903268487068e-06, + "loss": 0.1645, + "step": 4510, + "task_loss": 0.38987505435943604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1665954291820526, + "epoch": 1.63, + "learning_rate": 1.6562637042362466e-06, + "loss": 0.1797, + "step": 4520, + "task_loss": 0.6573120951652527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21217182278633118, + "epoch": 1.64, + "learning_rate": 1.6548347316815762e-06, + "loss": 0.1782, + "step": 4530, + "task_loss": 0.5453046560287476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13434047996997833, + "epoch": 1.64, + "learning_rate": 1.6534034143015454e-06, + "loss": 0.1626, + "step": 4540, + "task_loss": 0.31057432293891907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16948871314525604, + "epoch": 1.64, + "learning_rate": 1.6519697572214003e-06, + "loss": 0.1811, + "step": 4550, + "task_loss": 0.40966400504112244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14870113134384155, + "epoch": 1.65, + "learning_rate": 1.6505337655747651e-06, + "loss": 0.1712, + "step": 4560, + "task_loss": 0.46824079751968384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17549389600753784, + "epoch": 1.65, + "learning_rate": 1.649095444503624e-06, + "loss": 0.172, + "step": 4570, + "task_loss": 0.47035887837409973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16029421985149384, + "epoch": 1.66, + "learning_rate": 1.647654799158302e-06, + "loss": 0.1705, + "step": 4580, + "task_loss": 0.556218147277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17698289453983307, + "epoch": 1.66, + "learning_rate": 1.6462118346974465e-06, + "loss": 0.1826, + "step": 4590, + "task_loss": 0.48089244961738586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15601950883865356, + "epoch": 1.66, + "learning_rate": 1.6447665562880102e-06, + "loss": 0.1757, + "step": 4600, + "task_loss": 0.5369211435317993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18061840534210205, + "epoch": 1.67, + "learning_rate": 1.6433189691052304e-06, + "loss": 0.1852, + "step": 4610, + "task_loss": 0.5473412275314331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16047485172748566, + "epoch": 1.67, + "learning_rate": 1.6418690783326124e-06, + "loss": 0.1727, + "step": 4620, + "task_loss": 0.38301435112953186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.213972270488739, + "epoch": 1.67, + "learning_rate": 1.6404168891619099e-06, + "loss": 0.1884, + "step": 4630, + "task_loss": 0.4033554196357727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19080013036727905, + "epoch": 1.68, + "learning_rate": 1.6389624067931063e-06, + "loss": 0.1805, + "step": 4640, + "task_loss": 0.3459513783454895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19626328349113464, + "epoch": 1.68, + "learning_rate": 1.6375056364343976e-06, + "loss": 0.181, + "step": 4650, + "task_loss": 0.40217965841293335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13872280716896057, + "epoch": 1.68, + "learning_rate": 1.6360465833021714e-06, + "loss": 0.1681, + "step": 4660, + "task_loss": 0.6281946301460266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17640194296836853, + "epoch": 1.69, + "learning_rate": 1.6345852526209898e-06, + "loss": 0.1802, + "step": 4670, + "task_loss": 0.37003079056739807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1716892421245575, + "epoch": 1.69, + "learning_rate": 1.6331216496235704e-06, + "loss": 0.1797, + "step": 4680, + "task_loss": 0.26079100370407104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16866642236709595, + "epoch": 1.69, + "learning_rate": 1.6316557795507681e-06, + "loss": 0.1798, + "step": 4690, + "task_loss": 0.6609709858894348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14990170300006866, + "epoch": 1.7, + "learning_rate": 1.6301876476515543e-06, + "loss": 0.1595, + "step": 4700, + "task_loss": 0.40529322624206543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15224753320217133, + "epoch": 1.7, + "learning_rate": 1.6287172591830013e-06, + "loss": 0.1736, + "step": 4710, + "task_loss": 0.31739816069602966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1693233698606491, + "epoch": 1.71, + "learning_rate": 1.62724461941026e-06, + "loss": 0.1736, + "step": 4720, + "task_loss": 0.27699169516563416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14131656289100647, + "epoch": 1.71, + "learning_rate": 1.6257697336065437e-06, + "loss": 0.1751, + "step": 4730, + "task_loss": 0.2714840769767761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17337501049041748, + "epoch": 1.71, + "learning_rate": 1.6242926070531081e-06, + "loss": 0.1853, + "step": 4740, + "task_loss": 0.4388749599456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1466250717639923, + "epoch": 1.72, + "learning_rate": 1.6228132450392327e-06, + "loss": 0.1855, + "step": 4750, + "task_loss": 0.7262935042381287 + }, + { + "epoch": 1.72, + "eval_exact_match": 83.65184484389782, + "eval_f1": 90.0956056491023, + "step": 4750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19323478639125824, + "epoch": 1.72, + "learning_rate": 1.6213316528622013e-06, + "loss": 0.192, + "step": 4760, + "task_loss": 0.5335832834243774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20000815391540527, + "epoch": 1.72, + "learning_rate": 1.6198478358272834e-06, + "loss": 0.1739, + "step": 4770, + "task_loss": 0.38792669773101807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1575516015291214, + "epoch": 1.73, + "learning_rate": 1.6183617992477161e-06, + "loss": 0.1754, + "step": 4780, + "task_loss": 0.6262178421020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12828651070594788, + "epoch": 1.73, + "learning_rate": 1.6168735484446833e-06, + "loss": 0.1726, + "step": 4790, + "task_loss": 0.4547956585884094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14732638001441956, + "epoch": 1.73, + "learning_rate": 1.6153830887472983e-06, + "loss": 0.175, + "step": 4800, + "task_loss": 0.30394428968429565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13601407408714294, + "epoch": 1.74, + "learning_rate": 1.6138904254925831e-06, + "loss": 0.1766, + "step": 4810, + "task_loss": 0.3303675651550293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16339826583862305, + "epoch": 1.74, + "learning_rate": 1.612395564025451e-06, + "loss": 0.1643, + "step": 4820, + "task_loss": 0.4129944443702698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1557186394929886, + "epoch": 1.75, + "learning_rate": 1.6108985096986862e-06, + "loss": 0.1655, + "step": 4830, + "task_loss": 0.3643401563167572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16704073548316956, + "epoch": 1.75, + "learning_rate": 1.6093992678729252e-06, + "loss": 0.1857, + "step": 4840, + "task_loss": 0.33650100231170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1716494858264923, + "epoch": 1.75, + "learning_rate": 1.6078978439166372e-06, + "loss": 0.18, + "step": 4850, + "task_loss": 0.47747802734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13160017132759094, + "epoch": 1.76, + "learning_rate": 1.6063942432061062e-06, + "loss": 0.1681, + "step": 4860, + "task_loss": 0.3302234411239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14614957571029663, + "epoch": 1.76, + "learning_rate": 1.6048884711254086e-06, + "loss": 0.1657, + "step": 4870, + "task_loss": 0.4074876010417938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13879385590553284, + "epoch": 1.76, + "learning_rate": 1.6033805330663987e-06, + "loss": 0.1656, + "step": 4880, + "task_loss": 0.5125229358673096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1442992389202118, + "epoch": 1.77, + "learning_rate": 1.6018704344286844e-06, + "loss": 0.1663, + "step": 4890, + "task_loss": 0.33929747343063354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17423328757286072, + "epoch": 1.77, + "learning_rate": 1.6003581806196117e-06, + "loss": 0.1806, + "step": 4900, + "task_loss": 0.6483819484710693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16412119567394257, + "epoch": 1.77, + "learning_rate": 1.5988437770542426e-06, + "loss": 0.178, + "step": 4910, + "task_loss": 0.37893688678741455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13276568055152893, + "epoch": 1.78, + "learning_rate": 1.5973272291553381e-06, + "loss": 0.1658, + "step": 4920, + "task_loss": 0.5486671924591064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1630297601222992, + "epoch": 1.78, + "learning_rate": 1.5958085423533367e-06, + "loss": 0.1533, + "step": 4930, + "task_loss": 0.3111371695995331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15084078907966614, + "epoch": 1.79, + "learning_rate": 1.5942877220863367e-06, + "loss": 0.156, + "step": 4940, + "task_loss": 0.4554649889469147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1458296775817871, + "epoch": 1.79, + "learning_rate": 1.592764773800075e-06, + "loss": 0.1713, + "step": 4950, + "task_loss": 0.24363714456558228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19936226308345795, + "epoch": 1.79, + "learning_rate": 1.5912397029479088e-06, + "loss": 0.179, + "step": 4960, + "task_loss": 0.42103761434555054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15008442103862762, + "epoch": 1.8, + "learning_rate": 1.5897125149907961e-06, + "loss": 0.1738, + "step": 4970, + "task_loss": 0.6863405704498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1526724100112915, + "epoch": 1.8, + "learning_rate": 1.5881832153972757e-06, + "loss": 0.1757, + "step": 4980, + "task_loss": 0.24165448546409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15206190943717957, + "epoch": 1.8, + "learning_rate": 1.586651809643447e-06, + "loss": 0.1781, + "step": 4990, + "task_loss": 0.46479007601737976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13848325610160828, + "epoch": 1.81, + "learning_rate": 1.5851183032129524e-06, + "loss": 0.1767, + "step": 5000, + "task_loss": 0.5114116668701172 + }, + { + "epoch": 1.81, + "eval_exact_match": 83.69914853358561, + "eval_f1": 90.02717757860586, + "step": 5000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15613943338394165, + "epoch": 1.81, + "learning_rate": 1.5835827015969554e-06, + "loss": 0.1747, + "step": 5010, + "task_loss": 0.38822025060653687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18799282610416412, + "epoch": 1.81, + "learning_rate": 1.5820450102941225e-06, + "loss": 0.1926, + "step": 5020, + "task_loss": 0.5233188271522522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15289965271949768, + "epoch": 1.82, + "learning_rate": 1.5805052348106021e-06, + "loss": 0.1647, + "step": 5030, + "task_loss": 0.30817002058029175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19276724755764008, + "epoch": 1.82, + "learning_rate": 1.5789633806600064e-06, + "loss": 0.1794, + "step": 5040, + "task_loss": 0.6410900950431824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1649995744228363, + "epoch": 1.83, + "learning_rate": 1.5774194533633908e-06, + "loss": 0.1672, + "step": 5050, + "task_loss": 0.4419615864753723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1605244129896164, + "epoch": 1.83, + "learning_rate": 1.5758734584492338e-06, + "loss": 0.1971, + "step": 5060, + "task_loss": 0.5084717273712158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18353916704654694, + "epoch": 1.83, + "learning_rate": 1.574325401453418e-06, + "loss": 0.1884, + "step": 5070, + "task_loss": 0.46484851837158203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1702931821346283, + "epoch": 1.84, + "learning_rate": 1.5727752879192093e-06, + "loss": 0.1876, + "step": 5080, + "task_loss": 0.4551842212677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17121371626853943, + "epoch": 1.84, + "learning_rate": 1.5712231233972386e-06, + "loss": 0.1634, + "step": 5090, + "task_loss": 0.23131704330444336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12615904211997986, + "epoch": 1.84, + "learning_rate": 1.5696689134454802e-06, + "loss": 0.1659, + "step": 5100, + "task_loss": 0.4349666237831116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1561717838048935, + "epoch": 1.85, + "learning_rate": 1.5681126636292326e-06, + "loss": 0.182, + "step": 5110, + "task_loss": 0.5192071199417114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16207730770111084, + "epoch": 1.85, + "learning_rate": 1.5665543795210989e-06, + "loss": 0.1699, + "step": 5120, + "task_loss": 1.110886573791504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12102855741977692, + "epoch": 1.85, + "learning_rate": 1.564994066700967e-06, + "loss": 0.1718, + "step": 5130, + "task_loss": 0.39551660418510437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16230420768260956, + "epoch": 1.86, + "learning_rate": 1.5634317307559882e-06, + "loss": 0.1865, + "step": 5140, + "task_loss": 0.47860944271087646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.183951735496521, + "epoch": 1.86, + "learning_rate": 1.561867377280559e-06, + "loss": 0.174, + "step": 5150, + "task_loss": 0.3939540684223175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16964015364646912, + "epoch": 1.86, + "learning_rate": 1.5603010118762997e-06, + "loss": 0.1748, + "step": 5160, + "task_loss": 0.4153340458869934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1425359845161438, + "epoch": 1.87, + "learning_rate": 1.5587326401520357e-06, + "loss": 0.1766, + "step": 5170, + "task_loss": 0.4296700954437256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2204303741455078, + "epoch": 1.87, + "learning_rate": 1.5571622677237754e-06, + "loss": 0.1859, + "step": 5180, + "task_loss": 0.3614335060119629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18071290850639343, + "epoch": 1.88, + "learning_rate": 1.5555899002146928e-06, + "loss": 0.1698, + "step": 5190, + "task_loss": 0.4687485098838806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.174323171377182, + "epoch": 1.88, + "learning_rate": 1.5540155432551041e-06, + "loss": 0.172, + "step": 5200, + "task_loss": 0.45254212617874146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14772723615169525, + "epoch": 1.88, + "learning_rate": 1.5524392024824508e-06, + "loss": 0.1628, + "step": 5210, + "task_loss": 0.2734772562980652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1434626281261444, + "epoch": 1.89, + "learning_rate": 1.5508608835412773e-06, + "loss": 0.1659, + "step": 5220, + "task_loss": 0.3382406532764435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1506517231464386, + "epoch": 1.89, + "learning_rate": 1.5492805920832117e-06, + "loss": 0.1697, + "step": 5230, + "task_loss": 0.5476049184799194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19249403476715088, + "epoch": 1.89, + "learning_rate": 1.5476983337669451e-06, + "loss": 0.1781, + "step": 5240, + "task_loss": 0.4856623113155365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16049784421920776, + "epoch": 1.9, + "learning_rate": 1.5461141142582115e-06, + "loss": 0.1653, + "step": 5250, + "task_loss": 0.2884122133255005 + }, + { + "epoch": 1.9, + "eval_exact_match": 83.64238410596026, + "eval_f1": 90.02509905020891, + "step": 5250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.145027756690979, + "epoch": 1.9, + "learning_rate": 1.5445279392297672e-06, + "loss": 0.1717, + "step": 5260, + "task_loss": 0.27939456701278687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20636796951293945, + "epoch": 1.9, + "learning_rate": 1.5429398143613717e-06, + "loss": 0.1707, + "step": 5270, + "task_loss": 0.3808209002017975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19634541869163513, + "epoch": 1.91, + "learning_rate": 1.5413497453397658e-06, + "loss": 0.1781, + "step": 5280, + "task_loss": 0.8501614332199097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13850626349449158, + "epoch": 1.91, + "learning_rate": 1.5397577378586514e-06, + "loss": 0.1751, + "step": 5290, + "task_loss": 0.5088834762573242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1705392599105835, + "epoch": 1.92, + "learning_rate": 1.5381637976186733e-06, + "loss": 0.1822, + "step": 5300, + "task_loss": 0.453264057636261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14243380725383759, + "epoch": 1.92, + "learning_rate": 1.5365679303273956e-06, + "loss": 0.1858, + "step": 5310, + "task_loss": 0.8157292604446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14648321270942688, + "epoch": 1.92, + "learning_rate": 1.5349701416992828e-06, + "loss": 0.1604, + "step": 5320, + "task_loss": 0.44204074144363403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13808469474315643, + "epoch": 1.93, + "learning_rate": 1.5333704374556802e-06, + "loss": 0.1723, + "step": 5330, + "task_loss": 0.286069393157959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15084746479988098, + "epoch": 1.93, + "learning_rate": 1.5317688233247918e-06, + "loss": 0.1631, + "step": 5340, + "task_loss": 0.4517083764076233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16623607277870178, + "epoch": 1.93, + "learning_rate": 1.5301653050416607e-06, + "loss": 0.1859, + "step": 5350, + "task_loss": 0.34950506687164307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15999534726142883, + "epoch": 1.94, + "learning_rate": 1.5285598883481488e-06, + "loss": 0.1751, + "step": 5360, + "task_loss": 0.28156834840774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15565112233161926, + "epoch": 1.94, + "learning_rate": 1.526952578992915e-06, + "loss": 0.1744, + "step": 5370, + "task_loss": 0.561008095741272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19666364789009094, + "epoch": 1.94, + "learning_rate": 1.5253433827313959e-06, + "loss": 0.187, + "step": 5380, + "task_loss": 0.5248037576675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12654519081115723, + "epoch": 1.95, + "learning_rate": 1.5237323053257849e-06, + "loss": 0.1773, + "step": 5390, + "task_loss": 0.3182612359523773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1488625556230545, + "epoch": 1.95, + "learning_rate": 1.5221193525450105e-06, + "loss": 0.185, + "step": 5400, + "task_loss": 0.40940284729003906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17843294143676758, + "epoch": 1.96, + "learning_rate": 1.5205045301647176e-06, + "loss": 0.1733, + "step": 5410, + "task_loss": 0.4217627942562103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13039106130599976, + "epoch": 1.96, + "learning_rate": 1.5188878439672456e-06, + "loss": 0.161, + "step": 5420, + "task_loss": 0.6658985614776611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15170055627822876, + "epoch": 1.96, + "learning_rate": 1.5172692997416074e-06, + "loss": 0.1821, + "step": 5430, + "task_loss": 0.37789255380630493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1514560878276825, + "epoch": 1.97, + "learning_rate": 1.5156489032834689e-06, + "loss": 0.1965, + "step": 5440, + "task_loss": 0.40159061551094055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13064901530742645, + "epoch": 1.97, + "learning_rate": 1.5140266603951288e-06, + "loss": 0.174, + "step": 5450, + "task_loss": 0.4346536099910736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19731932878494263, + "epoch": 1.97, + "learning_rate": 1.5124025768854975e-06, + "loss": 0.1911, + "step": 5460, + "task_loss": 0.5681113600730896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1736607700586319, + "epoch": 1.98, + "learning_rate": 1.5107766585700765e-06, + "loss": 0.1773, + "step": 5470, + "task_loss": 0.40363502502441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18201759457588196, + "epoch": 1.98, + "learning_rate": 1.509148911270937e-06, + "loss": 0.1708, + "step": 5480, + "task_loss": 0.42472249269485474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1587354838848114, + "epoch": 1.98, + "learning_rate": 1.5075193408166995e-06, + "loss": 0.1696, + "step": 5490, + "task_loss": 0.5016087889671326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14640197157859802, + "epoch": 1.99, + "learning_rate": 1.5058879530425129e-06, + "loss": 0.1814, + "step": 5500, + "task_loss": 0.9462409019470215 + }, + { + "epoch": 1.99, + "eval_exact_match": 83.79375591296122, + "eval_f1": 90.06572493808123, + "step": 5500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17614376544952393, + "epoch": 1.99, + "learning_rate": 1.5042547537900334e-06, + "loss": 0.1724, + "step": 5510, + "task_loss": 0.5415596961975098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16879351437091827, + "epoch": 1.99, + "learning_rate": 1.5026197489074038e-06, + "loss": 0.1792, + "step": 5520, + "task_loss": 0.7170099020004272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13919247686862946, + "epoch": 2.0, + "learning_rate": 1.5009829442492321e-06, + "loss": 0.1896, + "step": 5530, + "task_loss": 0.37363719940185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13153812289237976, + "epoch": 2.0, + "learning_rate": 1.4993443456765722e-06, + "loss": 0.1611, + "step": 5540, + "task_loss": 0.27757397294044495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20778411626815796, + "epoch": 2.01, + "learning_rate": 1.4977039590569e-06, + "loss": 0.1793, + "step": 5550, + "task_loss": 0.8411521315574646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17973269522190094, + "epoch": 2.01, + "learning_rate": 1.4960617902640954e-06, + "loss": 0.1709, + "step": 5560, + "task_loss": 0.396151065826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1556098610162735, + "epoch": 2.01, + "learning_rate": 1.4944178451784185e-06, + "loss": 0.1685, + "step": 5570, + "task_loss": 0.6039637327194214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1373765766620636, + "epoch": 2.02, + "learning_rate": 1.4927721296864911e-06, + "loss": 0.1596, + "step": 5580, + "task_loss": 0.4312272071838379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11067195981740952, + "epoch": 2.02, + "learning_rate": 1.4911246496812736e-06, + "loss": 0.1822, + "step": 5590, + "task_loss": 0.2303856760263443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14561320841312408, + "epoch": 2.02, + "learning_rate": 1.4894754110620462e-06, + "loss": 0.1854, + "step": 5600, + "task_loss": 0.46308350563049316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16818490624427795, + "epoch": 2.03, + "learning_rate": 1.4878244197343843e-06, + "loss": 0.1812, + "step": 5610, + "task_loss": 0.76872318983078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17642438411712646, + "epoch": 2.03, + "learning_rate": 1.4861716816101408e-06, + "loss": 0.1649, + "step": 5620, + "task_loss": 0.31745028495788574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1748821884393692, + "epoch": 2.03, + "learning_rate": 1.4845172026074229e-06, + "loss": 0.181, + "step": 5630, + "task_loss": 0.5924056172370911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1267927885055542, + "epoch": 2.04, + "learning_rate": 1.4828609886505719e-06, + "loss": 0.1638, + "step": 5640, + "task_loss": 0.3259366750717163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19988219439983368, + "epoch": 2.04, + "learning_rate": 1.4812030456701412e-06, + "loss": 0.1733, + "step": 5650, + "task_loss": 0.5508837699890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15306976437568665, + "epoch": 2.05, + "learning_rate": 1.4795433796028758e-06, + "loss": 0.1812, + "step": 5660, + "task_loss": 0.22895438969135284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1771416962146759, + "epoch": 2.05, + "learning_rate": 1.4778819963916909e-06, + "loss": 0.172, + "step": 5670, + "task_loss": 0.6265113949775696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21116046607494354, + "epoch": 2.05, + "learning_rate": 1.4762189019856499e-06, + "loss": 0.1802, + "step": 5680, + "task_loss": 0.5534864664077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18786293268203735, + "epoch": 2.06, + "learning_rate": 1.4745541023399435e-06, + "loss": 0.1887, + "step": 5690, + "task_loss": 0.6557093858718872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20581617951393127, + "epoch": 2.06, + "learning_rate": 1.4728876034158692e-06, + "loss": 0.1742, + "step": 5700, + "task_loss": 0.4462522268295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13658413290977478, + "epoch": 2.06, + "learning_rate": 1.4712194111808093e-06, + "loss": 0.1734, + "step": 5710, + "task_loss": 0.4286487102508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17904964089393616, + "epoch": 2.07, + "learning_rate": 1.4695495316082085e-06, + "loss": 0.1724, + "step": 5720, + "task_loss": 0.5003844499588013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1682061105966568, + "epoch": 2.07, + "learning_rate": 1.4678779706775547e-06, + "loss": 0.1732, + "step": 5730, + "task_loss": 0.4084094762802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14257608354091644, + "epoch": 2.07, + "learning_rate": 1.466204734374355e-06, + "loss": 0.17, + "step": 5740, + "task_loss": 0.25102728605270386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16918689012527466, + "epoch": 2.08, + "learning_rate": 1.4645298286901168e-06, + "loss": 0.1728, + "step": 5750, + "task_loss": 0.3457295298576355 + }, + { + "epoch": 2.08, + "eval_exact_match": 83.74645222327341, + "eval_f1": 89.96027080749701, + "step": 5750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14827761054039001, + "epoch": 2.08, + "learning_rate": 1.4628532596223252e-06, + "loss": 0.1802, + "step": 5760, + "task_loss": 0.2933032512664795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14048659801483154, + "epoch": 2.09, + "learning_rate": 1.461175033174421e-06, + "loss": 0.1628, + "step": 5770, + "task_loss": 0.4381973147392273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14933642745018005, + "epoch": 2.09, + "learning_rate": 1.45949515535578e-06, + "loss": 0.1618, + "step": 5780, + "task_loss": 0.7806057929992676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15557800233364105, + "epoch": 2.09, + "learning_rate": 1.4578136321816908e-06, + "loss": 0.1717, + "step": 5790, + "task_loss": 0.554169774055481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13000071048736572, + "epoch": 2.1, + "learning_rate": 1.4561304696733342e-06, + "loss": 0.1656, + "step": 5800, + "task_loss": 0.22170893847942352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18678736686706543, + "epoch": 2.1, + "learning_rate": 1.4544456738577608e-06, + "loss": 0.1789, + "step": 5810, + "task_loss": 0.2693910300731659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16203993558883667, + "epoch": 2.1, + "learning_rate": 1.4527592507678702e-06, + "loss": 0.1692, + "step": 5820, + "task_loss": 0.4591533839702606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17562934756278992, + "epoch": 2.11, + "learning_rate": 1.4510712064423883e-06, + "loss": 0.1771, + "step": 5830, + "task_loss": 0.42079657316207886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19123172760009766, + "epoch": 2.11, + "learning_rate": 1.4493815469258466e-06, + "loss": 0.1712, + "step": 5840, + "task_loss": 0.33686795830726624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13670551776885986, + "epoch": 2.11, + "learning_rate": 1.4476902782685603e-06, + "loss": 0.1687, + "step": 5850, + "task_loss": 0.40342459082603455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12456159293651581, + "epoch": 2.12, + "learning_rate": 1.4459974065266062e-06, + "loss": 0.1907, + "step": 5860, + "task_loss": 0.29705381393432617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15771573781967163, + "epoch": 2.12, + "learning_rate": 1.444302937761802e-06, + "loss": 0.1684, + "step": 5870, + "task_loss": 0.5629597306251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1410112977027893, + "epoch": 2.13, + "learning_rate": 1.442606878041684e-06, + "loss": 0.1741, + "step": 5880, + "task_loss": 0.3297852873802185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18847446143627167, + "epoch": 2.13, + "learning_rate": 1.4409092334394845e-06, + "loss": 0.1759, + "step": 5890, + "task_loss": 0.48122259974479675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15323123335838318, + "epoch": 2.13, + "learning_rate": 1.439210010034112e-06, + "loss": 0.1687, + "step": 5900, + "task_loss": 0.3984212875366211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1503305733203888, + "epoch": 2.14, + "learning_rate": 1.4375092139101279e-06, + "loss": 0.1773, + "step": 5910, + "task_loss": 0.5217478275299072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16366368532180786, + "epoch": 2.14, + "learning_rate": 1.4358068511577248e-06, + "loss": 0.1824, + "step": 5920, + "task_loss": 0.7586253881454468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13122406601905823, + "epoch": 2.14, + "learning_rate": 1.434102927872706e-06, + "loss": 0.167, + "step": 5930, + "task_loss": 0.274662584066391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17711997032165527, + "epoch": 2.15, + "learning_rate": 1.4323974501564617e-06, + "loss": 0.1627, + "step": 5940, + "task_loss": 0.5237435102462769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1662512719631195, + "epoch": 2.15, + "learning_rate": 1.4306904241159488e-06, + "loss": 0.1883, + "step": 5950, + "task_loss": 0.2580040693283081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15990257263183594, + "epoch": 2.15, + "learning_rate": 1.4289818558636686e-06, + "loss": 0.1749, + "step": 5960, + "task_loss": 0.5619537830352783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18278679251670837, + "epoch": 2.16, + "learning_rate": 1.4272717515176443e-06, + "loss": 0.1619, + "step": 5970, + "task_loss": 0.32770949602127075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18862998485565186, + "epoch": 2.16, + "learning_rate": 1.425560117201399e-06, + "loss": 0.1882, + "step": 5980, + "task_loss": 0.4872078597545624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13203898072242737, + "epoch": 2.16, + "learning_rate": 1.4238469590439358e-06, + "loss": 0.1634, + "step": 5990, + "task_loss": 0.38844966888427734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14819516241550446, + "epoch": 2.17, + "learning_rate": 1.4221322831797133e-06, + "loss": 0.1705, + "step": 6000, + "task_loss": 0.5028584003448486 + }, + { + "epoch": 2.17, + "eval_exact_match": 83.67076631977294, + "eval_f1": 89.97577151956914, + "step": 6000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18577471375465393, + "epoch": 2.17, + "learning_rate": 1.420416095748625e-06, + "loss": 0.1704, + "step": 6010, + "task_loss": 0.30688726902008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1723865419626236, + "epoch": 2.18, + "learning_rate": 1.4186984028959766e-06, + "loss": 0.1799, + "step": 6020, + "task_loss": 0.42208823561668396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14269012212753296, + "epoch": 2.18, + "learning_rate": 1.4169792107724647e-06, + "loss": 0.161, + "step": 6030, + "task_loss": 0.3946291506290436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17888927459716797, + "epoch": 2.18, + "learning_rate": 1.4152585255341547e-06, + "loss": 0.1647, + "step": 6040, + "task_loss": 0.4846251904964447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12159587442874908, + "epoch": 2.19, + "learning_rate": 1.4135363533424585e-06, + "loss": 0.168, + "step": 6050, + "task_loss": 0.4344036281108856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15208034217357635, + "epoch": 2.19, + "learning_rate": 1.4118127003641116e-06, + "loss": 0.1769, + "step": 6060, + "task_loss": 0.43158042430877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1561131477355957, + "epoch": 2.19, + "learning_rate": 1.4100875727711533e-06, + "loss": 0.1698, + "step": 6070, + "task_loss": 0.627993643283844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16031518578529358, + "epoch": 2.2, + "learning_rate": 1.4083609767409019e-06, + "loss": 0.1599, + "step": 6080, + "task_loss": 0.21831873059272766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17668455839157104, + "epoch": 2.2, + "learning_rate": 1.406632918455935e-06, + "loss": 0.1931, + "step": 6090, + "task_loss": 0.29211580753326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17368084192276, + "epoch": 2.2, + "learning_rate": 1.4049034041040647e-06, + "loss": 0.1753, + "step": 6100, + "task_loss": 0.44469308853149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12126050889492035, + "epoch": 2.21, + "learning_rate": 1.4031724398783192e-06, + "loss": 0.1514, + "step": 6110, + "task_loss": 0.26887062191963196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16346772015094757, + "epoch": 2.21, + "learning_rate": 1.401440031976916e-06, + "loss": 0.1721, + "step": 6120, + "task_loss": 0.5284141302108765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1427915096282959, + "epoch": 2.22, + "learning_rate": 1.3997061866032439e-06, + "loss": 0.1686, + "step": 6130, + "task_loss": 0.5118228793144226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1318717747926712, + "epoch": 2.22, + "learning_rate": 1.3979709099658376e-06, + "loss": 0.1831, + "step": 6140, + "task_loss": 0.41089510917663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1968536078929901, + "epoch": 2.22, + "learning_rate": 1.3962342082783582e-06, + "loss": 0.1758, + "step": 6150, + "task_loss": 0.5051559209823608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15244260430335999, + "epoch": 2.23, + "learning_rate": 1.3944960877595684e-06, + "loss": 0.1816, + "step": 6160, + "task_loss": 0.3118957281112671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13998454809188843, + "epoch": 2.23, + "learning_rate": 1.3927565546333123e-06, + "loss": 0.1625, + "step": 6170, + "task_loss": 0.556923508644104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13642212748527527, + "epoch": 2.23, + "learning_rate": 1.391015615128492e-06, + "loss": 0.1645, + "step": 6180, + "task_loss": 0.35751280188560486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1508575975894928, + "epoch": 2.24, + "learning_rate": 1.3892732754790455e-06, + "loss": 0.1742, + "step": 6190, + "task_loss": 0.41975730657577515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10277068614959717, + "epoch": 2.24, + "learning_rate": 1.3875295419239242e-06, + "loss": 0.1581, + "step": 6200, + "task_loss": 0.3505735695362091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21870091557502747, + "epoch": 2.24, + "learning_rate": 1.385784420707071e-06, + "loss": 0.1818, + "step": 6210, + "task_loss": 0.4473569691181183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1878502368927002, + "epoch": 2.25, + "learning_rate": 1.3840379180773975e-06, + "loss": 0.1768, + "step": 6220, + "task_loss": 0.47854381799697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15077124536037445, + "epoch": 2.25, + "learning_rate": 1.3822900402887626e-06, + "loss": 0.1688, + "step": 6230, + "task_loss": 0.26176613569259644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18990695476531982, + "epoch": 2.26, + "learning_rate": 1.3805407935999482e-06, + "loss": 0.1832, + "step": 6240, + "task_loss": 0.546806275844574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1040068119764328, + "epoch": 2.26, + "learning_rate": 1.378790184274639e-06, + "loss": 0.1759, + "step": 6250, + "task_loss": 0.09956555813550949 + }, + { + "epoch": 2.26, + "eval_exact_match": 83.85998107852413, + "eval_f1": 90.12192979235915, + "step": 6250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.159162700176239, + "epoch": 2.26, + "learning_rate": 1.3770382185813986e-06, + "loss": 0.1785, + "step": 6260, + "task_loss": 0.2582295536994934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16721557080745697, + "epoch": 2.27, + "learning_rate": 1.3752849027936473e-06, + "loss": 0.167, + "step": 6270, + "task_loss": 0.3453947603702545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14433017373085022, + "epoch": 2.27, + "learning_rate": 1.3735302431896396e-06, + "loss": 0.167, + "step": 6280, + "task_loss": 0.22514958679676056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20126046240329742, + "epoch": 2.27, + "learning_rate": 1.3717742460524429e-06, + "loss": 0.1887, + "step": 6290, + "task_loss": 0.6400581002235413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21604719758033752, + "epoch": 2.28, + "learning_rate": 1.3700169176699125e-06, + "loss": 0.1939, + "step": 6300, + "task_loss": 0.35505056381225586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1490190625190735, + "epoch": 2.28, + "learning_rate": 1.3682582643346728e-06, + "loss": 0.173, + "step": 6310, + "task_loss": 0.566085934638977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12886354327201843, + "epoch": 2.28, + "learning_rate": 1.36649829234409e-06, + "loss": 0.1792, + "step": 6320, + "task_loss": 0.32131779193878174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14631399512290955, + "epoch": 2.29, + "learning_rate": 1.3647370080002541e-06, + "loss": 0.1629, + "step": 6330, + "task_loss": 0.3405493497848511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13580074906349182, + "epoch": 2.29, + "learning_rate": 1.3629744176099535e-06, + "loss": 0.1617, + "step": 6340, + "task_loss": 0.2829074263572693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16273626685142517, + "epoch": 2.29, + "learning_rate": 1.3612105274846538e-06, + "loss": 0.174, + "step": 6350, + "task_loss": 0.34644442796707153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16554933786392212, + "epoch": 2.3, + "learning_rate": 1.3594453439404733e-06, + "loss": 0.1848, + "step": 6360, + "task_loss": 0.40179452300071716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17596763372421265, + "epoch": 2.3, + "learning_rate": 1.357678873298164e-06, + "loss": 0.1648, + "step": 6370, + "task_loss": 0.5932073593139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15840421617031097, + "epoch": 2.31, + "learning_rate": 1.3559111218830848e-06, + "loss": 0.1796, + "step": 6380, + "task_loss": 0.6633092761039734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16752852499485016, + "epoch": 2.31, + "learning_rate": 1.3541420960251813e-06, + "loss": 0.1728, + "step": 6390, + "task_loss": 0.6710745096206665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1526416689157486, + "epoch": 2.31, + "learning_rate": 1.3523718020589634e-06, + "loss": 0.1717, + "step": 6400, + "task_loss": 0.21283848583698273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14025653898715973, + "epoch": 2.32, + "learning_rate": 1.3506002463234811e-06, + "loss": 0.1718, + "step": 6410, + "task_loss": 0.22466395795345306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14743945002555847, + "epoch": 2.32, + "learning_rate": 1.348827435162302e-06, + "loss": 0.1761, + "step": 6420, + "task_loss": 0.40274888277053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18202725052833557, + "epoch": 2.32, + "learning_rate": 1.3470533749234906e-06, + "loss": 0.169, + "step": 6430, + "task_loss": 0.7552968859672546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17869505286216736, + "epoch": 2.33, + "learning_rate": 1.3452780719595831e-06, + "loss": 0.1666, + "step": 6440, + "task_loss": 0.2956041097640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1480785310268402, + "epoch": 2.33, + "learning_rate": 1.3435015326275654e-06, + "loss": 0.1684, + "step": 6450, + "task_loss": 0.3291912376880646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15362969040870667, + "epoch": 2.33, + "learning_rate": 1.3417237632888513e-06, + "loss": 0.1661, + "step": 6460, + "task_loss": 0.38771361112594604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1606786698102951, + "epoch": 2.34, + "learning_rate": 1.3399447703092584e-06, + "loss": 0.176, + "step": 6470, + "task_loss": 0.5447548627853394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14728298783302307, + "epoch": 2.34, + "learning_rate": 1.3381645600589865e-06, + "loss": 0.1682, + "step": 6480, + "task_loss": 0.4820564091205597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16725045442581177, + "epoch": 2.35, + "learning_rate": 1.3363831389125936e-06, + "loss": 0.1777, + "step": 6490, + "task_loss": 0.41131776571273804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16478011012077332, + "epoch": 2.35, + "learning_rate": 1.3346005132489739e-06, + "loss": 0.1756, + "step": 6500, + "task_loss": 0.3583562970161438 + }, + { + "epoch": 2.35, + "eval_exact_match": 83.61400189214758, + "eval_f1": 89.97690778977226, + "step": 6500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21100406348705292, + "epoch": 2.35, + "learning_rate": 1.3328166894513346e-06, + "loss": 0.1659, + "step": 6510, + "task_loss": 0.41332000494003296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14449000358581543, + "epoch": 2.36, + "learning_rate": 1.3310316739071738e-06, + "loss": 0.1687, + "step": 6520, + "task_loss": 0.28724896907806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15244629979133606, + "epoch": 2.36, + "learning_rate": 1.329245473008256e-06, + "loss": 0.1682, + "step": 6530, + "task_loss": 0.37325456738471985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1374513804912567, + "epoch": 2.36, + "learning_rate": 1.3274580931505911e-06, + "loss": 0.1838, + "step": 6540, + "task_loss": 0.47368472814559937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1499250829219818, + "epoch": 2.37, + "learning_rate": 1.3256695407344103e-06, + "loss": 0.1662, + "step": 6550, + "task_loss": 0.4007405638694763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17637519538402557, + "epoch": 2.37, + "learning_rate": 1.3238798221641427e-06, + "loss": 0.1872, + "step": 6560, + "task_loss": 0.4767989218235016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19467362761497498, + "epoch": 2.37, + "learning_rate": 1.3220889438483944e-06, + "loss": 0.168, + "step": 6570, + "task_loss": 0.5928551554679871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1555582582950592, + "epoch": 2.38, + "learning_rate": 1.3202969121999234e-06, + "loss": 0.1708, + "step": 6580, + "task_loss": 0.5098388195037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13599567115306854, + "epoch": 2.38, + "learning_rate": 1.3185037336356182e-06, + "loss": 0.1658, + "step": 6590, + "task_loss": 0.3146175146102905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1555548906326294, + "epoch": 2.39, + "learning_rate": 1.316709414576474e-06, + "loss": 0.1726, + "step": 6600, + "task_loss": 0.5938575863838196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12219253927469254, + "epoch": 2.39, + "learning_rate": 1.3149139614475693e-06, + "loss": 0.174, + "step": 6610, + "task_loss": 0.3089248538017273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1555480659008026, + "epoch": 2.39, + "learning_rate": 1.3131173806780443e-06, + "loss": 0.1676, + "step": 6620, + "task_loss": 0.42747241258621216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17220956087112427, + "epoch": 2.4, + "learning_rate": 1.311319678701076e-06, + "loss": 0.1887, + "step": 6630, + "task_loss": 0.4911194443702698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.197723388671875, + "epoch": 2.4, + "learning_rate": 1.3095208619538574e-06, + "loss": 0.1714, + "step": 6640, + "task_loss": 0.36501121520996094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14832803606987, + "epoch": 2.4, + "learning_rate": 1.3077209368775724e-06, + "loss": 0.1724, + "step": 6650, + "task_loss": 0.4433749318122864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17430132627487183, + "epoch": 2.41, + "learning_rate": 1.3059199099173741e-06, + "loss": 0.1818, + "step": 6660, + "task_loss": 0.5781666040420532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16755987703800201, + "epoch": 2.41, + "learning_rate": 1.3041177875223612e-06, + "loss": 0.1678, + "step": 6670, + "task_loss": 0.6399859189987183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16882233321666718, + "epoch": 2.41, + "learning_rate": 1.302314576145554e-06, + "loss": 0.1698, + "step": 6680, + "task_loss": 0.6496683359146118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1583394557237625, + "epoch": 2.42, + "learning_rate": 1.3005102822438738e-06, + "loss": 0.1789, + "step": 6690, + "task_loss": 0.32157063484191895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13417063653469086, + "epoch": 2.42, + "learning_rate": 1.2987049122781171e-06, + "loss": 0.1802, + "step": 6700, + "task_loss": 0.2793649435043335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15821221470832825, + "epoch": 2.43, + "learning_rate": 1.2968984727129332e-06, + "loss": 0.1818, + "step": 6710, + "task_loss": 0.34716230630874634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16095976531505585, + "epoch": 2.43, + "learning_rate": 1.295090970016803e-06, + "loss": 0.1825, + "step": 6720, + "task_loss": 0.19834813475608826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14987853169441223, + "epoch": 2.43, + "learning_rate": 1.2932824106620125e-06, + "loss": 0.1635, + "step": 6730, + "task_loss": 0.7775259613990784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16203686594963074, + "epoch": 2.44, + "learning_rate": 1.291472801124632e-06, + "loss": 0.1696, + "step": 6740, + "task_loss": 0.39739635586738586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14740723371505737, + "epoch": 2.44, + "learning_rate": 1.2896621478844931e-06, + "loss": 0.1799, + "step": 6750, + "task_loss": 0.3119643032550812 + }, + { + "epoch": 2.44, + "eval_exact_match": 83.70860927152317, + "eval_f1": 89.9757275904899, + "step": 6750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18803681433200836, + "epoch": 2.44, + "learning_rate": 1.2878504574251637e-06, + "loss": 0.1717, + "step": 6760, + "task_loss": 0.4526791572570801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17158998548984528, + "epoch": 2.45, + "learning_rate": 1.2860377362339257e-06, + "loss": 0.1711, + "step": 6770, + "task_loss": 0.4078561067581177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13310539722442627, + "epoch": 2.45, + "learning_rate": 1.2842239908017526e-06, + "loss": 0.1676, + "step": 6780, + "task_loss": 0.27124446630477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.166331946849823, + "epoch": 2.45, + "learning_rate": 1.2824092276232853e-06, + "loss": 0.1686, + "step": 6790, + "task_loss": 0.6495641469955444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16789960861206055, + "epoch": 2.46, + "learning_rate": 1.280593453196808e-06, + "loss": 0.1742, + "step": 6800, + "task_loss": 0.4059957265853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1527012437582016, + "epoch": 2.46, + "learning_rate": 1.2787766740242277e-06, + "loss": 0.1686, + "step": 6810, + "task_loss": 0.3953001797199249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16180524230003357, + "epoch": 2.46, + "learning_rate": 1.2769588966110476e-06, + "loss": 0.1789, + "step": 6820, + "task_loss": 0.49090176820755005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1586184799671173, + "epoch": 2.47, + "learning_rate": 1.2751401274663463e-06, + "loss": 0.158, + "step": 6830, + "task_loss": 0.24430197477340698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12404391914606094, + "epoch": 2.47, + "learning_rate": 1.2733203731027534e-06, + "loss": 0.1605, + "step": 6840, + "task_loss": 0.253868043422699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17260029911994934, + "epoch": 2.48, + "learning_rate": 1.2714996400364262e-06, + "loss": 0.1541, + "step": 6850, + "task_loss": 0.4611034691333771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1461942195892334, + "epoch": 2.48, + "learning_rate": 1.2696779347870265e-06, + "loss": 0.1741, + "step": 6860, + "task_loss": 0.5523576736450195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12024559080600739, + "epoch": 2.48, + "learning_rate": 1.2678552638776979e-06, + "loss": 0.1755, + "step": 6870, + "task_loss": 0.3268803060054779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18247783184051514, + "epoch": 2.49, + "learning_rate": 1.2660316338350408e-06, + "loss": 0.1767, + "step": 6880, + "task_loss": 0.3315047025680542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21110492944717407, + "epoch": 2.49, + "learning_rate": 1.2642070511890905e-06, + "loss": 0.1858, + "step": 6890, + "task_loss": 0.6726840734481812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13523760437965393, + "epoch": 2.49, + "learning_rate": 1.2623815224732941e-06, + "loss": 0.1731, + "step": 6900, + "task_loss": 0.17781083285808563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13068142533302307, + "epoch": 2.5, + "learning_rate": 1.2605550542244854e-06, + "loss": 0.1613, + "step": 6910, + "task_loss": 0.41651782393455505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13178616762161255, + "epoch": 2.5, + "learning_rate": 1.2587276529828628e-06, + "loss": 0.1858, + "step": 6920, + "task_loss": 0.48381632566452026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1575162410736084, + "epoch": 2.5, + "learning_rate": 1.2568993252919652e-06, + "loss": 0.1753, + "step": 6930, + "task_loss": 0.27174267172813416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21238577365875244, + "epoch": 2.51, + "learning_rate": 1.25507007769865e-06, + "loss": 0.1898, + "step": 6940, + "task_loss": 0.3620792031288147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15523602068424225, + "epoch": 2.51, + "learning_rate": 1.2532399167530674e-06, + "loss": 0.1751, + "step": 6950, + "task_loss": 0.24264416098594666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20205236971378326, + "epoch": 2.52, + "learning_rate": 1.2514088490086387e-06, + "loss": 0.1882, + "step": 6960, + "task_loss": 0.49486780166625977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1760149896144867, + "epoch": 2.52, + "learning_rate": 1.2495768810220321e-06, + "loss": 0.1753, + "step": 6970, + "task_loss": 0.5111973881721497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1396222710609436, + "epoch": 2.52, + "learning_rate": 1.2477440193531393e-06, + "loss": 0.1635, + "step": 6980, + "task_loss": 0.4071466624736786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21526896953582764, + "epoch": 2.53, + "learning_rate": 1.2459102705650523e-06, + "loss": 0.184, + "step": 6990, + "task_loss": 0.43278732895851135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15099546313285828, + "epoch": 2.53, + "learning_rate": 1.24407564122404e-06, + "loss": 0.1687, + "step": 7000, + "task_loss": 0.4598119854927063 + }, + { + "epoch": 2.53, + "eval_exact_match": 83.38694418164617, + "eval_f1": 89.83912379851094, + "step": 7000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13700950145721436, + "epoch": 2.53, + "learning_rate": 1.2422401378995231e-06, + "loss": 0.1684, + "step": 7010, + "task_loss": 0.23394280672073364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19724495708942413, + "epoch": 2.54, + "learning_rate": 1.2404037671640534e-06, + "loss": 0.1905, + "step": 7020, + "task_loss": 0.5948208570480347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17370860278606415, + "epoch": 2.54, + "learning_rate": 1.2385665355932874e-06, + "loss": 0.1626, + "step": 7030, + "task_loss": 0.5977954268455505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15155650675296783, + "epoch": 2.54, + "learning_rate": 1.2367284497659659e-06, + "loss": 0.179, + "step": 7040, + "task_loss": 0.7188633680343628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.26555588841438293, + "epoch": 2.55, + "learning_rate": 1.2348895162638862e-06, + "loss": 0.1875, + "step": 7050, + "task_loss": 0.5506167411804199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1538572609424591, + "epoch": 2.55, + "learning_rate": 1.2330497416718824e-06, + "loss": 0.1718, + "step": 7060, + "task_loss": 0.6333431601524353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17727243900299072, + "epoch": 2.56, + "learning_rate": 1.2312091325778004e-06, + "loss": 0.1747, + "step": 7070, + "task_loss": 0.6085351705551147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1532391905784607, + "epoch": 2.56, + "learning_rate": 1.229367695572474e-06, + "loss": 0.1628, + "step": 7080, + "task_loss": 0.14348584413528442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19841280579566956, + "epoch": 2.56, + "learning_rate": 1.2275254372497012e-06, + "loss": 0.1637, + "step": 7090, + "task_loss": 0.5597392916679382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1571100950241089, + "epoch": 2.57, + "learning_rate": 1.225682364206222e-06, + "loss": 0.1766, + "step": 7100, + "task_loss": 0.44082239270210266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1467963010072708, + "epoch": 2.57, + "learning_rate": 1.2238384830416926e-06, + "loss": 0.1659, + "step": 7110, + "task_loss": 0.31584417819976807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1948840469121933, + "epoch": 2.57, + "learning_rate": 1.2219938003586635e-06, + "loss": 0.1776, + "step": 7120, + "task_loss": 0.32976752519607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1948937028646469, + "epoch": 2.58, + "learning_rate": 1.2201483227625549e-06, + "loss": 0.1675, + "step": 7130, + "task_loss": 0.6085909008979797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1637001782655716, + "epoch": 2.58, + "learning_rate": 1.2183020568616342e-06, + "loss": 0.1846, + "step": 7140, + "task_loss": 0.47807058691978455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16208887100219727, + "epoch": 2.58, + "learning_rate": 1.2164550092669906e-06, + "loss": 0.177, + "step": 7150, + "task_loss": 0.4254646599292755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13784141838550568, + "epoch": 2.59, + "learning_rate": 1.214607186592513e-06, + "loss": 0.17, + "step": 7160, + "task_loss": 0.6249512434005737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17609910666942596, + "epoch": 2.59, + "learning_rate": 1.212758595454866e-06, + "loss": 0.1824, + "step": 7170, + "task_loss": 0.6437938213348389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1495467573404312, + "epoch": 2.59, + "learning_rate": 1.210909242473464e-06, + "loss": 0.1638, + "step": 7180, + "task_loss": 0.3057920038700104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22128623723983765, + "epoch": 2.6, + "learning_rate": 1.2090591342704523e-06, + "loss": 0.1908, + "step": 7190, + "task_loss": 0.3862370252609253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2089284509420395, + "epoch": 2.6, + "learning_rate": 1.2072082774706783e-06, + "loss": 0.1803, + "step": 7200, + "task_loss": 0.3655293583869934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1751350313425064, + "epoch": 2.61, + "learning_rate": 1.205356678701671e-06, + "loss": 0.1847, + "step": 7210, + "task_loss": 0.6336475610733032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14995576441287994, + "epoch": 2.61, + "learning_rate": 1.2035043445936158e-06, + "loss": 0.1643, + "step": 7220, + "task_loss": 0.5754516124725342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.191546231508255, + "epoch": 2.61, + "learning_rate": 1.201651281779331e-06, + "loss": 0.1847, + "step": 7230, + "task_loss": 0.5577281713485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17802578210830688, + "epoch": 2.62, + "learning_rate": 1.1997974968942448e-06, + "loss": 0.1745, + "step": 7240, + "task_loss": 0.45092934370040894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1552976369857788, + "epoch": 2.62, + "learning_rate": 1.1979429965763707e-06, + "loss": 0.1731, + "step": 7250, + "task_loss": 0.5546841621398926 + }, + { + "epoch": 2.62, + "eval_exact_match": 83.4720908230842, + "eval_f1": 89.94530706324616, + "step": 7250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12661629915237427, + "epoch": 2.62, + "learning_rate": 1.1960877874662842e-06, + "loss": 0.159, + "step": 7260, + "task_loss": 0.24371370673179626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17779144644737244, + "epoch": 2.63, + "learning_rate": 1.1942318762070984e-06, + "loss": 0.1738, + "step": 7270, + "task_loss": 0.29593953490257263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1766924411058426, + "epoch": 2.63, + "learning_rate": 1.1923752694444413e-06, + "loss": 0.1862, + "step": 7280, + "task_loss": 0.5446988344192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13910958170890808, + "epoch": 2.63, + "learning_rate": 1.1905179738264307e-06, + "loss": 0.1742, + "step": 7290, + "task_loss": 0.2903839349746704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15694880485534668, + "epoch": 2.64, + "learning_rate": 1.1886599960036514e-06, + "loss": 0.1825, + "step": 7300, + "task_loss": 0.37020811438560486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1348649561405182, + "epoch": 2.64, + "learning_rate": 1.186801342629131e-06, + "loss": 0.1704, + "step": 7310, + "task_loss": 0.31780362129211426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13491874933242798, + "epoch": 2.65, + "learning_rate": 1.184942020358316e-06, + "loss": 0.1796, + "step": 7320, + "task_loss": 0.2244960069656372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1295914649963379, + "epoch": 2.65, + "learning_rate": 1.1830820358490481e-06, + "loss": 0.1742, + "step": 7330, + "task_loss": 0.24627065658569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1553851217031479, + "epoch": 2.65, + "learning_rate": 1.1812213957615407e-06, + "loss": 0.1855, + "step": 7340, + "task_loss": 0.3259097635746002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16126763820648193, + "epoch": 2.66, + "learning_rate": 1.179360106758354e-06, + "loss": 0.1701, + "step": 7350, + "task_loss": 0.48443925380706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16280007362365723, + "epoch": 2.66, + "learning_rate": 1.1774981755043721e-06, + "loss": 0.1779, + "step": 7360, + "task_loss": 0.5628104209899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2028045356273651, + "epoch": 2.66, + "learning_rate": 1.1756356086667795e-06, + "loss": 0.1779, + "step": 7370, + "task_loss": 0.4160672724246979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13652898371219635, + "epoch": 2.67, + "learning_rate": 1.1737724129150357e-06, + "loss": 0.1713, + "step": 7380, + "task_loss": 0.46745267510414124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18952497839927673, + "epoch": 2.67, + "learning_rate": 1.1719085949208525e-06, + "loss": 0.1726, + "step": 7390, + "task_loss": 0.7140260934829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1258544772863388, + "epoch": 2.67, + "learning_rate": 1.1700441613581702e-06, + "loss": 0.1553, + "step": 7400, + "task_loss": 0.26404812932014465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13442295789718628, + "epoch": 2.68, + "learning_rate": 1.168179118903133e-06, + "loss": 0.1707, + "step": 7410, + "task_loss": 0.4021362066268921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16747888922691345, + "epoch": 2.68, + "learning_rate": 1.1663134742340648e-06, + "loss": 0.1748, + "step": 7420, + "task_loss": 0.2707682251930237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13281439244747162, + "epoch": 2.69, + "learning_rate": 1.164447234031447e-06, + "loss": 0.1679, + "step": 7430, + "task_loss": 0.4617811143398285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1636105477809906, + "epoch": 2.69, + "learning_rate": 1.1625804049778931e-06, + "loss": 0.1688, + "step": 7440, + "task_loss": 0.6173032522201538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18871071934700012, + "epoch": 2.69, + "learning_rate": 1.160712993758125e-06, + "loss": 0.1944, + "step": 7450, + "task_loss": 0.4313560724258423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.143826425075531, + "epoch": 2.7, + "learning_rate": 1.1588450070589492e-06, + "loss": 0.1692, + "step": 7460, + "task_loss": 0.496171236038208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12385374307632446, + "epoch": 2.7, + "learning_rate": 1.1569764515692334e-06, + "loss": 0.1574, + "step": 7470, + "task_loss": 0.521141529083252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1483931839466095, + "epoch": 2.7, + "learning_rate": 1.1551073339798803e-06, + "loss": 0.1631, + "step": 7480, + "task_loss": 0.30746322870254517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1397826075553894, + "epoch": 2.71, + "learning_rate": 1.1532376609838079e-06, + "loss": 0.1604, + "step": 7490, + "task_loss": 0.2872992753982544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18801283836364746, + "epoch": 2.71, + "learning_rate": 1.151367439275921e-06, + "loss": 0.1768, + "step": 7500, + "task_loss": 0.588277280330658 + }, + { + "epoch": 2.71, + "eval_exact_match": 83.66130558183538, + "eval_f1": 90.07695887230169, + "step": 7500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15823519229888916, + "epoch": 2.71, + "learning_rate": 1.1494966755530901e-06, + "loss": 0.1851, + "step": 7510, + "task_loss": 0.3757522404193878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14509174227714539, + "epoch": 2.72, + "learning_rate": 1.1476253765141267e-06, + "loss": 0.1717, + "step": 7520, + "task_loss": 0.15500980615615845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.147189199924469, + "epoch": 2.72, + "learning_rate": 1.1457535488597587e-06, + "loss": 0.1668, + "step": 7530, + "task_loss": 0.34534624218940735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14676904678344727, + "epoch": 2.72, + "learning_rate": 1.1438811992926067e-06, + "loss": 0.1817, + "step": 7540, + "task_loss": 0.49516406655311584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.195867121219635, + "epoch": 2.73, + "learning_rate": 1.1420083345171608e-06, + "loss": 0.1738, + "step": 7550, + "task_loss": 0.6467814445495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11699579656124115, + "epoch": 2.73, + "learning_rate": 1.140134961239755e-06, + "loss": 0.1672, + "step": 7560, + "task_loss": 0.3284699022769928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14966784417629242, + "epoch": 2.74, + "learning_rate": 1.1382610861685456e-06, + "loss": 0.1768, + "step": 7570, + "task_loss": 0.33868739008903503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16117683053016663, + "epoch": 2.74, + "learning_rate": 1.1363867160134843e-06, + "loss": 0.1679, + "step": 7580, + "task_loss": 0.37554022669792175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1663319170475006, + "epoch": 2.74, + "learning_rate": 1.1345118574862967e-06, + "loss": 0.1682, + "step": 7590, + "task_loss": 0.5620455741882324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1660982072353363, + "epoch": 2.75, + "learning_rate": 1.1326365173004555e-06, + "loss": 0.1822, + "step": 7600, + "task_loss": 0.4505634307861328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13936099410057068, + "epoch": 2.75, + "learning_rate": 1.1307607021711606e-06, + "loss": 0.1681, + "step": 7610, + "task_loss": 0.3778277635574341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17037974298000336, + "epoch": 2.75, + "learning_rate": 1.12888441881531e-06, + "loss": 0.1794, + "step": 7620, + "task_loss": 0.3082660436630249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19312995672225952, + "epoch": 2.76, + "learning_rate": 1.1270076739514805e-06, + "loss": 0.1784, + "step": 7630, + "task_loss": 0.6790364980697632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14956381916999817, + "epoch": 2.76, + "learning_rate": 1.1251304742998999e-06, + "loss": 0.1646, + "step": 7640, + "task_loss": 0.3310818672180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1720355749130249, + "epoch": 2.76, + "learning_rate": 1.1232528265824252e-06, + "loss": 0.1871, + "step": 7650, + "task_loss": 0.5396093130111694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12346095591783524, + "epoch": 2.77, + "learning_rate": 1.1213747375225178e-06, + "loss": 0.1782, + "step": 7660, + "task_loss": 0.5943026542663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17838293313980103, + "epoch": 2.77, + "learning_rate": 1.1194962138452194e-06, + "loss": 0.1751, + "step": 7670, + "task_loss": 0.49117571115493774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15319928526878357, + "epoch": 2.78, + "learning_rate": 1.1176172622771276e-06, + "loss": 0.1701, + "step": 7680, + "task_loss": 0.2504529356956482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12942051887512207, + "epoch": 2.78, + "learning_rate": 1.115737889546373e-06, + "loss": 0.1704, + "step": 7690, + "task_loss": 0.8131101131439209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17500701546669006, + "epoch": 2.78, + "learning_rate": 1.1138581023825937e-06, + "loss": 0.1713, + "step": 7700, + "task_loss": 0.3743223249912262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15439695119857788, + "epoch": 2.79, + "learning_rate": 1.1119779075169117e-06, + "loss": 0.1639, + "step": 7710, + "task_loss": 0.5529880523681641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15999752283096313, + "epoch": 2.79, + "learning_rate": 1.1100973116819092e-06, + "loss": 0.1556, + "step": 7720, + "task_loss": 0.3388964533805847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18036921322345734, + "epoch": 2.79, + "learning_rate": 1.1082163216116044e-06, + "loss": 0.1663, + "step": 7730, + "task_loss": 0.5201153755187988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14596882462501526, + "epoch": 2.8, + "learning_rate": 1.1063349440414265e-06, + "loss": 0.1598, + "step": 7740, + "task_loss": 0.3204638361930847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1631544828414917, + "epoch": 2.8, + "learning_rate": 1.1044531857081927e-06, + "loss": 0.1804, + "step": 7750, + "task_loss": 0.8809808492660522 + }, + { + "epoch": 2.8, + "eval_exact_match": 83.52885525070955, + "eval_f1": 89.94545472443673, + "step": 7750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18084211647510529, + "epoch": 2.8, + "learning_rate": 1.1025710533500838e-06, + "loss": 0.1682, + "step": 7760, + "task_loss": 0.5090347528457642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1455087661743164, + "epoch": 2.81, + "learning_rate": 1.1006885537066194e-06, + "loss": 0.1893, + "step": 7770, + "task_loss": 0.32133185863494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12667664885520935, + "epoch": 2.81, + "learning_rate": 1.0988056935186346e-06, + "loss": 0.1586, + "step": 7780, + "task_loss": 0.2582120895385742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16505160927772522, + "epoch": 2.82, + "learning_rate": 1.0969224795282556e-06, + "loss": 0.1773, + "step": 7790, + "task_loss": 0.6498008966445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12269654124975204, + "epoch": 2.82, + "learning_rate": 1.0950389184788754e-06, + "loss": 0.1786, + "step": 7800, + "task_loss": 0.7851200103759766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15462878346443176, + "epoch": 2.82, + "learning_rate": 1.0931550171151295e-06, + "loss": 0.1717, + "step": 7810, + "task_loss": 0.45008930563926697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17051701247692108, + "epoch": 2.83, + "learning_rate": 1.0912707821828724e-06, + "loss": 0.1914, + "step": 7820, + "task_loss": 0.40274274349212646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14285723865032196, + "epoch": 2.83, + "learning_rate": 1.089386220429153e-06, + "loss": 0.1757, + "step": 7830, + "task_loss": 0.513251543045044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19114567339420319, + "epoch": 2.83, + "learning_rate": 1.0875013386021893e-06, + "loss": 0.1888, + "step": 7840, + "task_loss": 0.5363246202468872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1428094506263733, + "epoch": 2.84, + "learning_rate": 1.0856161434513475e-06, + "loss": 0.1713, + "step": 7850, + "task_loss": 0.4419270157814026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15643715858459473, + "epoch": 2.84, + "learning_rate": 1.0837306417271147e-06, + "loss": 0.1681, + "step": 7860, + "task_loss": 0.44533056020736694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13387146592140198, + "epoch": 2.84, + "learning_rate": 1.0818448401810753e-06, + "loss": 0.1737, + "step": 7870, + "task_loss": 0.4950907230377197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15607303380966187, + "epoch": 2.85, + "learning_rate": 1.079958745565888e-06, + "loss": 0.1749, + "step": 7880, + "task_loss": 0.6860287189483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1685732752084732, + "epoch": 2.85, + "learning_rate": 1.0780723646352605e-06, + "loss": 0.1731, + "step": 7890, + "task_loss": 0.5070062875747681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15234628319740295, + "epoch": 2.86, + "learning_rate": 1.076185704143926e-06, + "loss": 0.1662, + "step": 7900, + "task_loss": 0.32623738050460815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14870676398277283, + "epoch": 2.86, + "learning_rate": 1.0742987708476185e-06, + "loss": 0.1779, + "step": 7910, + "task_loss": 0.43853697180747986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11807222664356232, + "epoch": 2.86, + "learning_rate": 1.0724115715030495e-06, + "loss": 0.1545, + "step": 7920, + "task_loss": 0.3483704626560211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1611451804637909, + "epoch": 2.87, + "learning_rate": 1.0705241128678824e-06, + "loss": 0.1668, + "step": 7930, + "task_loss": 0.4193282723426819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15785019099712372, + "epoch": 2.87, + "learning_rate": 1.0686364017007093e-06, + "loss": 0.1789, + "step": 7940, + "task_loss": 0.315855473279953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19590085744857788, + "epoch": 2.87, + "learning_rate": 1.0667484447610261e-06, + "loss": 0.1817, + "step": 7950, + "task_loss": 0.45036888122558594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1536681354045868, + "epoch": 2.88, + "learning_rate": 1.0648602488092104e-06, + "loss": 0.1783, + "step": 7960, + "task_loss": 0.39551639556884766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12338414788246155, + "epoch": 2.88, + "learning_rate": 1.0629718206064935e-06, + "loss": 0.151, + "step": 7970, + "task_loss": 0.35970136523246765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12747308611869812, + "epoch": 2.88, + "learning_rate": 1.06108316691494e-06, + "loss": 0.1743, + "step": 7980, + "task_loss": 0.3183417320251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15406057238578796, + "epoch": 2.89, + "learning_rate": 1.0591942944974212e-06, + "loss": 0.1725, + "step": 7990, + "task_loss": 0.7116891145706177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1521359086036682, + "epoch": 2.89, + "learning_rate": 1.0573052101175915e-06, + "loss": 0.1804, + "step": 8000, + "task_loss": 0.45792704820632935 + }, + { + "epoch": 2.89, + "eval_exact_match": 83.81267738883633, + "eval_f1": 90.15074155517108, + "step": 8000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19931340217590332, + "epoch": 2.89, + "learning_rate": 1.0554159205398643e-06, + "loss": 0.1716, + "step": 8010, + "task_loss": 0.7359171509742737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1555495262145996, + "epoch": 2.9, + "learning_rate": 1.0535264325293885e-06, + "loss": 0.1763, + "step": 8020, + "task_loss": 0.5659838914871216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18331894278526306, + "epoch": 2.9, + "learning_rate": 1.0516367528520227e-06, + "loss": 0.1768, + "step": 8030, + "task_loss": 0.6728960871696472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19219288229942322, + "epoch": 2.91, + "learning_rate": 1.0497468882743122e-06, + "loss": 0.1806, + "step": 8040, + "task_loss": 0.504513144493103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12473896145820618, + "epoch": 2.91, + "learning_rate": 1.0478568455634641e-06, + "loss": 0.1709, + "step": 8050, + "task_loss": 0.22338946163654327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15949636697769165, + "epoch": 2.91, + "learning_rate": 1.045966631487324e-06, + "loss": 0.1682, + "step": 8060, + "task_loss": 0.4108890891075134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16450482606887817, + "epoch": 2.92, + "learning_rate": 1.0440762528143505e-06, + "loss": 0.1658, + "step": 8070, + "task_loss": 0.38757970929145813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14186137914657593, + "epoch": 2.92, + "learning_rate": 1.042185716313592e-06, + "loss": 0.1741, + "step": 8080, + "task_loss": 0.3712804913520813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16010576486587524, + "epoch": 2.92, + "learning_rate": 1.040295028754661e-06, + "loss": 0.1657, + "step": 8090, + "task_loss": 0.2653946876525879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1634744256734848, + "epoch": 2.93, + "learning_rate": 1.0384041969077125e-06, + "loss": 0.1678, + "step": 8100, + "task_loss": 0.37681901454925537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17398503422737122, + "epoch": 2.93, + "learning_rate": 1.0365132275434175e-06, + "loss": 0.1694, + "step": 8110, + "task_loss": 0.5321211814880371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14301329851150513, + "epoch": 2.93, + "learning_rate": 1.0346221274329392e-06, + "loss": 0.1627, + "step": 8120, + "task_loss": 0.2358737289905548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19131416082382202, + "epoch": 2.94, + "learning_rate": 1.0327309033479087e-06, + "loss": 0.1897, + "step": 8130, + "task_loss": 0.35499224066734314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1476271152496338, + "epoch": 2.94, + "learning_rate": 1.0308395620604016e-06, + "loss": 0.1682, + "step": 8140, + "task_loss": 0.42450767755508423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17429782450199127, + "epoch": 2.95, + "learning_rate": 1.0289481103429135e-06, + "loss": 0.1829, + "step": 8150, + "task_loss": 0.4377296566963196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16124795377254486, + "epoch": 2.95, + "learning_rate": 1.0270565549683342e-06, + "loss": 0.1786, + "step": 8160, + "task_loss": 0.3876572549343109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15951263904571533, + "epoch": 2.95, + "learning_rate": 1.0251649027099262e-06, + "loss": 0.1673, + "step": 8170, + "task_loss": 0.4029073119163513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1840726137161255, + "epoch": 2.96, + "learning_rate": 1.0232731603412972e-06, + "loss": 0.1927, + "step": 8180, + "task_loss": 0.4143099784851074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18531106412410736, + "epoch": 2.96, + "learning_rate": 1.0213813346363792e-06, + "loss": 0.1768, + "step": 8190, + "task_loss": 0.6813019514083862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18536627292633057, + "epoch": 2.96, + "learning_rate": 1.0194894323694014e-06, + "loss": 0.176, + "step": 8200, + "task_loss": 0.5788917541503906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1379977911710739, + "epoch": 2.97, + "learning_rate": 1.0175974603148683e-06, + "loss": 0.1811, + "step": 8210, + "task_loss": 0.8245267271995544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14147508144378662, + "epoch": 2.97, + "learning_rate": 1.0157054252475335e-06, + "loss": 0.1668, + "step": 8220, + "task_loss": 0.48868924379348755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17832288146018982, + "epoch": 2.97, + "learning_rate": 1.0138133339423757e-06, + "loss": 0.1748, + "step": 8230, + "task_loss": 0.48949164152145386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15996500849723816, + "epoch": 2.98, + "learning_rate": 1.0119211931745766e-06, + "loss": 0.1719, + "step": 8240, + "task_loss": 0.4312325716018677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15269511938095093, + "epoch": 2.98, + "learning_rate": 1.0100290097194932e-06, + "loss": 0.1666, + "step": 8250, + "task_loss": 0.543391227722168 + }, + { + "epoch": 2.98, + "eval_exact_match": 83.62346263008514, + "eval_f1": 90.03105862224157, + "step": 8250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16537584364414215, + "epoch": 2.99, + "learning_rate": 1.0081367903526367e-06, + "loss": 0.1708, + "step": 8260, + "task_loss": 0.23162506520748138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1359395980834961, + "epoch": 2.99, + "learning_rate": 1.0062445418496466e-06, + "loss": 0.1698, + "step": 8270, + "task_loss": 0.4973392188549042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18227165937423706, + "epoch": 2.99, + "learning_rate": 1.0043522709862663e-06, + "loss": 0.1872, + "step": 8280, + "task_loss": 0.4104633331298828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1791878640651703, + "epoch": 3.0, + "learning_rate": 1.0024599845383195e-06, + "loss": 0.1789, + "step": 8290, + "task_loss": 0.4813977777957916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15984641015529633, + "epoch": 3.0, + "learning_rate": 1.0005676892816859e-06, + "loss": 0.1717, + "step": 8300, + "task_loss": 0.5337830781936646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 8.050780939351022e-05, + "compression/movement_sparsity/importance_threshold": -0.8842994279815715, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18141396343708038, + "epoch": 3.0, + "learning_rate": 9.986753919922763e-07, + "loss": 0.1893, + "step": 8310, + "task_loss": 0.4816855788230896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00018104905245944604, + "compression/movement_sparsity/importance_threshold": -0.8829299076613767, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17543792724609375, + "epoch": 3.01, + "learning_rate": 9.967830994460091e-07, + "loss": 0.1674, + "step": 8320, + "task_loss": 0.7297599911689758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0002814864359787489, + "compression/movement_sparsity/importance_threshold": -0.8815618020617008, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14357587695121765, + "epoch": 3.01, + "learning_rate": 9.94890818418786e-07, + "loss": 0.1749, + "step": 8330, + "task_loss": 0.4249134957790375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00038182001362283225, + "compression/movement_sparsity/importance_threshold": -0.8801951104514598, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15991082787513733, + "epoch": 3.01, + "learning_rate": 9.929985556864669e-07, + "loss": 0.162, + "step": 8340, + "task_loss": 0.3581671714782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00048204983906308785, + "compression/movement_sparsity/importance_threshold": -0.87882983209957, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16143129765987396, + "epoch": 3.02, + "learning_rate": 9.911063180248462e-07, + "loss": 0.1693, + "step": 8350, + "task_loss": 0.6364099383354187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0005821759659709508, + "compression/movement_sparsity/importance_threshold": -0.8774659662749472, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14541073143482208, + "epoch": 3.02, + "learning_rate": 9.89214112209629e-07, + "loss": 0.1719, + "step": 8360, + "task_loss": 0.2846934497356415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0006821984480178128, + "compression/movement_sparsity/importance_threshold": -0.8761035122465077, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14429126679897308, + "epoch": 3.02, + "learning_rate": 9.873219450164061e-07, + "loss": 0.1838, + "step": 8370, + "task_loss": 0.33501607179641724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0007821173388750946, + "compression/movement_sparsity/importance_threshold": -0.8747424692831676, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19137001037597656, + "epoch": 3.03, + "learning_rate": 9.854298232206296e-07, + "loss": 0.1799, + "step": 8380, + "task_loss": 0.4200936555862427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0008819326922142095, + "compression/movement_sparsity/importance_threshold": -0.8733828366538425, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18857447803020477, + "epoch": 3.03, + "learning_rate": 9.835377535975905e-07, + "loss": 0.1761, + "step": 8390, + "task_loss": 0.4611113667488098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0009816445617065568, + "compression/movement_sparsity/importance_threshold": -0.872024613627449, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.139084130525589, + "epoch": 3.04, + "learning_rate": 9.816457429223905e-07, + "loss": 0.1636, + "step": 8400, + "task_loss": 0.5555346012115479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0010812530010235349, + "compression/movement_sparsity/importance_threshold": -0.8706677994729032, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12424302101135254, + "epoch": 3.04, + "learning_rate": 9.797537979699225e-07, + "loss": 0.1682, + "step": 8410, + "task_loss": 0.3569965064525604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0011807580638365867, + "compression/movement_sparsity/importance_threshold": -0.8693123934591207, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13371171057224274, + "epoch": 3.04, + "learning_rate": 9.778619255148434e-07, + "loss": 0.1677, + "step": 8420, + "task_loss": 0.4002540707588196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.001280159803817118, + "compression/movement_sparsity/importance_threshold": -0.8679583948550178, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15741753578186035, + "epoch": 3.05, + "learning_rate": 9.759701323315496e-07, + "loss": 0.1722, + "step": 8430, + "task_loss": 0.4664173722267151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.001379458274636528, + "compression/movement_sparsity/importance_threshold": -0.8666058029295104, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16380411386489868, + "epoch": 3.05, + "learning_rate": 9.740784251941549e-07, + "loss": 0.1842, + "step": 8440, + "task_loss": 0.3489740490913391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.001478653529966223, + "compression/movement_sparsity/importance_threshold": -0.865254616951515, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2031557559967041, + "epoch": 3.05, + "learning_rate": 9.721868108764637e-07, + "loss": 0.1717, + "step": 8450, + "task_loss": 0.8436130881309509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0015777456234776306, + "compression/movement_sparsity/importance_threshold": -0.8639048361899471, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19330856204032898, + "epoch": 3.06, + "learning_rate": 9.702952961519502e-07, + "loss": 0.1723, + "step": 8460, + "task_loss": 0.9914184808731079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0016767346088421425, + "compression/movement_sparsity/importance_threshold": -0.8625564599137232, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16919749975204468, + "epoch": 3.06, + "learning_rate": 9.68403887793729e-07, + "loss": 0.1642, + "step": 8470, + "task_loss": 0.4527369737625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0017756205397311798, + "compression/movement_sparsity/importance_threshold": -0.8612094873917593, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16218455135822296, + "epoch": 3.06, + "learning_rate": 9.66512592574536e-07, + "loss": 0.1842, + "step": 8480, + "task_loss": 0.23674465715885162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0018744034698161484, + "compression/movement_sparsity/importance_threshold": -0.8598639178929713, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14123575389385223, + "epoch": 3.07, + "learning_rate": 9.646214172667018e-07, + "loss": 0.1799, + "step": 8490, + "task_loss": 0.442488431930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.001973083452768469, + "compression/movement_sparsity/importance_threshold": -0.8585197506862754, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13976861536502838, + "epoch": 3.07, + "learning_rate": 9.627303686421263e-07, + "loss": 0.17, + "step": 8500, + "task_loss": 0.23940631747245789 + }, + { + "epoch": 3.07, + "eval_exact_match": 83.62346263008514, + "eval_f1": 90.07971472160804, + "step": 8500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002071660542259533, + "compression/movement_sparsity/importance_threshold": -0.8571769850405877, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15648941695690155, + "epoch": 3.08, + "learning_rate": 9.608394534722578e-07, + "loss": 0.1894, + "step": 8510, + "task_loss": 0.293619304895401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002170134791960769, + "compression/movement_sparsity/importance_threshold": -0.8558356202248242, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16870902478694916, + "epoch": 3.08, + "learning_rate": 9.58948678528064e-07, + "loss": 0.1768, + "step": 8520, + "task_loss": 0.6857977509498596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002268506255543583, + "compression/movement_sparsity/importance_threshold": -0.8544956555079009, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14667314291000366, + "epoch": 3.08, + "learning_rate": 9.570580505800134e-07, + "loss": 0.1685, + "step": 8530, + "task_loss": 0.3974134922027588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0023667749866793804, + "compression/movement_sparsity/importance_threshold": -0.8531570901587339, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2171468436717987, + "epoch": 3.09, + "learning_rate": 9.551675763980463e-07, + "loss": 0.1824, + "step": 8540, + "task_loss": 0.7033668160438538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002464941039039561, + "compression/movement_sparsity/importance_threshold": -0.8518199234462395, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15260061621665955, + "epoch": 3.09, + "learning_rate": 9.532772627515527e-07, + "loss": 0.1665, + "step": 8550, + "task_loss": 0.22714334726333618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0025630044662955597, + "compression/movement_sparsity/importance_threshold": -0.8504841546393335, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14580965042114258, + "epoch": 3.09, + "learning_rate": 9.513871164093483e-07, + "loss": 0.1718, + "step": 8560, + "task_loss": 0.3902367949485779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002660965322118783, + "compression/movement_sparsity/importance_threshold": -0.8491497830069319, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22626420855522156, + "epoch": 3.1, + "learning_rate": 9.494971441396488e-07, + "loss": 0.1679, + "step": 8570, + "task_loss": 0.6860054731369019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002758823660180637, + "compression/movement_sparsity/importance_threshold": -0.8478168078179509, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13325051963329315, + "epoch": 3.1, + "learning_rate": 9.476073527100477e-07, + "loss": 0.1708, + "step": 8580, + "task_loss": 0.5953207612037659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002856579534152527, + "compression/movement_sparsity/importance_threshold": -0.8464852283413066, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14980527758598328, + "epoch": 3.1, + "learning_rate": 9.457177488874907e-07, + "loss": 0.175, + "step": 8590, + "task_loss": 0.5516312122344971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0029542329977058674, + "compression/movement_sparsity/importance_threshold": -0.845155043845915, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14336633682250977, + "epoch": 3.11, + "learning_rate": 9.438283394382505e-07, + "loss": 0.1727, + "step": 8600, + "task_loss": 0.43941259384155273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0030517841045120714, + "compression/movement_sparsity/importance_threshold": -0.8438262536006922, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.138885036110878, + "epoch": 3.11, + "learning_rate": 9.419391311279053e-07, + "loss": 0.1608, + "step": 8610, + "task_loss": 0.4903583824634552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0031492329082425455, + "compression/movement_sparsity/importance_threshold": -0.8424988568745543, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14071306586265564, + "epoch": 3.12, + "learning_rate": 9.40050130721312e-07, + "loss": 0.1671, + "step": 8620, + "task_loss": 0.4048214554786682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0032465794625687024, + "compression/movement_sparsity/importance_threshold": -0.8411728529364173, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16939450800418854, + "epoch": 3.12, + "learning_rate": 9.381613449825843e-07, + "loss": 0.1777, + "step": 8630, + "task_loss": 0.5866584777832031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003343823821161949, + "compression/movement_sparsity/importance_threshold": -0.8398482410551974, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11506274342536926, + "epoch": 3.12, + "learning_rate": 9.362727806750654e-07, + "loss": 0.1697, + "step": 8640, + "task_loss": 0.45650506019592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003440966037693698, + "compression/movement_sparsity/importance_threshold": -0.8385250204998105, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18618786334991455, + "epoch": 3.13, + "learning_rate": 9.343844445613072e-07, + "loss": 0.1852, + "step": 8650, + "task_loss": 0.341508150100708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0035380061658353635, + "compression/movement_sparsity/importance_threshold": -0.8372031905391728, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15694350004196167, + "epoch": 3.13, + "learning_rate": 9.324963434030442e-07, + "loss": 0.1587, + "step": 8660, + "task_loss": 0.6575144529342651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003634944259258359, + "compression/movement_sparsity/importance_threshold": -0.8358827504422001, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1654762327671051, + "epoch": 3.13, + "learning_rate": 9.306084839611687e-07, + "loss": 0.1746, + "step": 8670, + "task_loss": 0.257068395614624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0037317803716340825, + "compression/movement_sparsity/importance_threshold": -0.8345636994778088, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1543157696723938, + "epoch": 3.14, + "learning_rate": 9.287208729957085e-07, + "loss": 0.1791, + "step": 8680, + "task_loss": 0.6083182096481323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003828514556633934, + "compression/movement_sparsity/importance_threshold": -0.8332460369149152, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15461325645446777, + "epoch": 3.14, + "learning_rate": 9.268335172658008e-07, + "loss": 0.1743, + "step": 8690, + "task_loss": 0.3445891737937927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.003925146867929356, + "compression/movement_sparsity/importance_threshold": -0.8319297620224346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14635170996189117, + "epoch": 3.14, + "learning_rate": 9.249464235296695e-07, + "loss": 0.1698, + "step": 8700, + "task_loss": 0.3346456289291382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004021677359191753, + "compression/movement_sparsity/importance_threshold": -0.8306148740692835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17073814570903778, + "epoch": 3.15, + "learning_rate": 9.230595985446003e-07, + "loss": 0.1737, + "step": 8710, + "task_loss": 0.5876675844192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004118106084092519, + "compression/movement_sparsity/importance_threshold": -0.8293013723243781, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20534959435462952, + "epoch": 3.15, + "learning_rate": 9.21173049066916e-07, + "loss": 0.1693, + "step": 8720, + "task_loss": 0.382878839969635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00421443309630308, + "compression/movement_sparsity/importance_threshold": -0.827989256056634, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12774941325187683, + "epoch": 3.16, + "learning_rate": 9.192867818519535e-07, + "loss": 0.1775, + "step": 8730, + "task_loss": 0.19287356734275818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0043106584494948365, + "compression/movement_sparsity/importance_threshold": -0.8266785245349677, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1298234462738037, + "epoch": 3.16, + "learning_rate": 9.174008036540384e-07, + "loss": 0.1682, + "step": 8740, + "task_loss": 0.2842978239059448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004406782197339201, + "compression/movement_sparsity/importance_threshold": -0.8253691770282953, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1581944227218628, + "epoch": 3.16, + "learning_rate": 9.155151212264625e-07, + "loss": 0.1603, + "step": 8750, + "task_loss": 0.26196685433387756 + }, + { + "epoch": 3.16, + "eval_exact_match": 83.49101229895932, + "eval_f1": 89.91894198656578, + "step": 8750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004502804393507587, + "compression/movement_sparsity/importance_threshold": -0.8240612128055325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13445620238780975, + "epoch": 3.17, + "learning_rate": 9.136297413214566e-07, + "loss": 0.1729, + "step": 8760, + "task_loss": 0.1746811419725418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004598725091671408, + "compression/movement_sparsity/importance_threshold": -0.8227546311355955, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12342646718025208, + "epoch": 3.17, + "learning_rate": 9.1174467069017e-07, + "loss": 0.1721, + "step": 8770, + "task_loss": 0.33239758014678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004694544345502063, + "compression/movement_sparsity/importance_threshold": -0.8214494312874007, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15459150075912476, + "epoch": 3.17, + "learning_rate": 9.098599160826441e-07, + "loss": 0.1722, + "step": 8780, + "task_loss": 0.3906312584877014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004790262208670973, + "compression/movement_sparsity/importance_threshold": -0.8201456125298637, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16902866959571838, + "epoch": 3.18, + "learning_rate": 9.079754842477879e-07, + "loss": 0.1756, + "step": 8790, + "task_loss": 0.3516932725906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004885878734849544, + "compression/movement_sparsity/importance_threshold": -0.8188431741319009, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1674996316432953, + "epoch": 3.18, + "learning_rate": 9.060913819333559e-07, + "loss": 0.1744, + "step": 8800, + "task_loss": 0.5219091176986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0049813939777091885, + "compression/movement_sparsity/importance_threshold": -0.8175421153624282, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14921870827674866, + "epoch": 3.18, + "learning_rate": 9.042076158859214e-07, + "loss": 0.1648, + "step": 8810, + "task_loss": 0.5738848447799683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005076807990921315, + "compression/movement_sparsity/importance_threshold": -0.8162424354903617, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14361397922039032, + "epoch": 3.19, + "learning_rate": 9.023241928508542e-07, + "loss": 0.1649, + "step": 8820, + "task_loss": 0.5719636678695679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005172120828157334, + "compression/movement_sparsity/importance_threshold": -0.8149441337846175, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1588568091392517, + "epoch": 3.19, + "learning_rate": 9.004411195722965e-07, + "loss": 0.1811, + "step": 8830, + "task_loss": 0.39752593636512756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00526733254308864, + "compression/movement_sparsity/importance_threshold": -0.8136472095141118, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2179379165172577, + "epoch": 3.19, + "learning_rate": 8.985584027931364e-07, + "loss": 0.1771, + "step": 8840, + "task_loss": 0.455571711063385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00536244318938668, + "compression/movement_sparsity/importance_threshold": -0.8123516619477603, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15764448046684265, + "epoch": 3.2, + "learning_rate": 8.966760492549872e-07, + "loss": 0.1683, + "step": 8850, + "task_loss": 0.5573818683624268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005457452820722847, + "compression/movement_sparsity/importance_threshold": -0.8110574903544794, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15336552262306213, + "epoch": 3.2, + "learning_rate": 8.947940656981603e-07, + "loss": 0.1729, + "step": 8860, + "task_loss": 0.30682915449142456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005552361490768549, + "compression/movement_sparsity/importance_threshold": -0.8097646940031848, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15846428275108337, + "epoch": 3.21, + "learning_rate": 8.929124588616429e-07, + "loss": 0.1706, + "step": 8870, + "task_loss": 0.5824536681175232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005647169253195196, + "compression/movement_sparsity/importance_threshold": -0.8084732721627931, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13658425211906433, + "epoch": 3.21, + "learning_rate": 8.910312354830736e-07, + "loss": 0.1763, + "step": 8880, + "task_loss": 0.7418199777603149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0057418761616742035, + "compression/movement_sparsity/importance_threshold": -0.8071832241022199, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21933336555957794, + "epoch": 3.21, + "learning_rate": 8.891504022987165e-07, + "loss": 0.1868, + "step": 8890, + "task_loss": 0.48175936937332153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005836482269876977, + "compression/movement_sparsity/importance_threshold": -0.8058945490903814, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12823690474033356, + "epoch": 3.22, + "learning_rate": 8.8726996604344e-07, + "loss": 0.1711, + "step": 8900, + "task_loss": 0.35165947675704956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005930987631474931, + "compression/movement_sparsity/importance_threshold": -0.8046072463961937, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1561659872531891, + "epoch": 3.22, + "learning_rate": 8.853899334506904e-07, + "loss": 0.1773, + "step": 8910, + "task_loss": 0.23708105087280273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00602539230013947, + "compression/movement_sparsity/importance_threshold": -0.803321315288573, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1587158590555191, + "epoch": 3.22, + "learning_rate": 8.835103112524691e-07, + "loss": 0.1701, + "step": 8920, + "task_loss": 0.3867292106151581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006119696329542016, + "compression/movement_sparsity/importance_threshold": -0.8020367550364351, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13717815279960632, + "epoch": 3.23, + "learning_rate": 8.816311061793068e-07, + "loss": 0.1711, + "step": 8930, + "task_loss": 0.31392791867256165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006213899773353968, + "compression/movement_sparsity/importance_threshold": -0.8007535649086963, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16188368201255798, + "epoch": 3.23, + "learning_rate": 8.79752324960242e-07, + "loss": 0.1637, + "step": 8940, + "task_loss": 0.5714429616928101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006308002685246738, + "compression/movement_sparsity/importance_threshold": -0.7994717441742726, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1368107795715332, + "epoch": 3.23, + "learning_rate": 8.778739743227951e-07, + "loss": 0.179, + "step": 8950, + "task_loss": 0.36976689100265503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006402005118891741, + "compression/movement_sparsity/importance_threshold": -0.7981912921020801, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1518588364124298, + "epoch": 3.24, + "learning_rate": 8.759960609929435e-07, + "loss": 0.1639, + "step": 8960, + "task_loss": 0.6866006255149841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00649590712796039, + "compression/movement_sparsity/importance_threshold": -0.7969122079610346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20307350158691406, + "epoch": 3.24, + "learning_rate": 8.741185916951006e-07, + "loss": 0.1676, + "step": 8970, + "task_loss": 0.28236985206604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006589708766124069, + "compression/movement_sparsity/importance_threshold": -0.7956344910200528, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16698408126831055, + "epoch": 3.25, + "learning_rate": 8.72241573152088e-07, + "loss": 0.1652, + "step": 8980, + "task_loss": 0.30868786573410034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006683410087054235, + "compression/movement_sparsity/importance_threshold": -0.79435814054805, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15856650471687317, + "epoch": 3.25, + "learning_rate": 8.703650120851146e-07, + "loss": 0.1839, + "step": 8990, + "task_loss": 0.16748939454555511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006777011144422273, + "compression/movement_sparsity/importance_threshold": -0.7930831558139426, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15102289617061615, + "epoch": 3.25, + "learning_rate": 8.684889152137508e-07, + "loss": 0.1715, + "step": 9000, + "task_loss": 0.4023663103580475 + }, + { + "epoch": 3.25, + "eval_exact_match": 83.50047303689688, + "eval_f1": 89.90574516831549, + "step": 9000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006870511991899589, + "compression/movement_sparsity/importance_threshold": -0.7918095360866468, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12239585816860199, + "epoch": 3.26, + "learning_rate": 8.66613289255904e-07, + "loss": 0.1694, + "step": 9010, + "task_loss": 0.3852020502090454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00696391268315761, + "compression/movement_sparsity/importance_threshold": -0.7905372806350783, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17271217703819275, + "epoch": 3.26, + "learning_rate": 8.647381409277966e-07, + "loss": 0.1749, + "step": 9020, + "task_loss": 0.48539960384368896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007057213271867729, + "compression/movement_sparsity/importance_threshold": -0.7892663887281537, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17906677722930908, + "epoch": 3.26, + "learning_rate": 8.628634769439398e-07, + "loss": 0.172, + "step": 9030, + "task_loss": 0.44566771388053894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007150413811701367, + "compression/movement_sparsity/importance_threshold": -0.7879968596347886, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18354880809783936, + "epoch": 3.27, + "learning_rate": 8.609893040171112e-07, + "loss": 0.1751, + "step": 9040, + "task_loss": 0.3899494707584381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0072435143563299356, + "compression/movement_sparsity/importance_threshold": -0.7867286926238993, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17823264002799988, + "epoch": 3.27, + "learning_rate": 8.59115628858329e-07, + "loss": 0.1753, + "step": 9050, + "task_loss": 0.3264240622520447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007336514959424835, + "compression/movement_sparsity/importance_threshold": -0.7854618869644019, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16943280398845673, + "epoch": 3.27, + "learning_rate": 8.5724245817683e-07, + "loss": 0.1781, + "step": 9060, + "task_loss": 0.31149184703826904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007429415674657485, + "compression/movement_sparsity/importance_threshold": -0.7841964419252122, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13426220417022705, + "epoch": 3.28, + "learning_rate": 8.553697986800444e-07, + "loss": 0.1806, + "step": 9070, + "task_loss": 0.3615609109401703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007522216555699301, + "compression/movement_sparsity/importance_threshold": -0.7829323567752465, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14723095297813416, + "epoch": 3.28, + "learning_rate": 8.534976570735711e-07, + "loss": 0.1751, + "step": 9080, + "task_loss": 0.2963823676109314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007614917656221679, + "compression/movement_sparsity/importance_threshold": -0.7816696307834208, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1296226531267166, + "epoch": 3.29, + "learning_rate": 8.516260400611559e-07, + "loss": 0.1796, + "step": 9090, + "task_loss": 0.4440094828605652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007707519029896036, + "compression/movement_sparsity/importance_threshold": -0.7804082632186513, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1356373429298401, + "epoch": 3.29, + "learning_rate": 8.497549543446649e-07, + "loss": 0.1669, + "step": 9100, + "task_loss": 0.3411458134651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0078000207303937814, + "compression/movement_sparsity/importance_threshold": -0.7791482533498539, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15570493042469025, + "epoch": 3.29, + "learning_rate": 8.478844066240624e-07, + "loss": 0.1928, + "step": 9110, + "task_loss": 0.35819387435913086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007892422811386331, + "compression/movement_sparsity/importance_threshold": -0.7778896004459447, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16601881384849548, + "epoch": 3.3, + "learning_rate": 8.460144035973866e-07, + "loss": 0.1692, + "step": 9120, + "task_loss": 0.5100635290145874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007984725326545077, + "compression/movement_sparsity/importance_threshold": -0.7766323037758399, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17528662085533142, + "epoch": 3.3, + "learning_rate": 8.44144951960724e-07, + "loss": 0.1646, + "step": 9130, + "task_loss": 0.4419068396091461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00807692832954146, + "compression/movement_sparsity/importance_threshold": -0.7753763626084553, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13393300771713257, + "epoch": 3.3, + "learning_rate": 8.422760584081881e-07, + "loss": 0.1774, + "step": 9140, + "task_loss": 0.16897660493850708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008169031874046875, + "compression/movement_sparsity/importance_threshold": -0.7741217762127072, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13365423679351807, + "epoch": 3.31, + "learning_rate": 8.404077296318928e-07, + "loss": 0.1632, + "step": 9150, + "task_loss": 0.30446919798851013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008261036013732738, + "compression/movement_sparsity/importance_threshold": -0.7728685438575115, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2315337210893631, + "epoch": 3.31, + "learning_rate": 8.385399723219313e-07, + "loss": 0.191, + "step": 9160, + "task_loss": 0.4761585593223572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008352940802270453, + "compression/movement_sparsity/importance_threshold": -0.7716166648117844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16882847249507904, + "epoch": 3.31, + "learning_rate": 8.366727931663481e-07, + "loss": 0.1695, + "step": 9170, + "task_loss": 0.2730327248573303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008444746293331428, + "compression/movement_sparsity/importance_threshold": -0.7703661383444418, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1654556542634964, + "epoch": 3.32, + "learning_rate": 8.348061988511194e-07, + "loss": 0.1676, + "step": 9180, + "task_loss": 0.7155327796936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00853645254058708, + "compression/movement_sparsity/importance_threshold": -0.7691169637244, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20778971910476685, + "epoch": 3.32, + "learning_rate": 8.329401960601273e-07, + "loss": 0.1829, + "step": 9190, + "task_loss": 0.5636166930198669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008628059597708822, + "compression/movement_sparsity/importance_threshold": -0.7678691402205748, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16762343049049377, + "epoch": 3.32, + "learning_rate": 8.310747914751339e-07, + "loss": 0.1736, + "step": 9200, + "task_loss": 0.32610780000686646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008719567518368061, + "compression/movement_sparsity/importance_threshold": -0.7666226671018825, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1574648916721344, + "epoch": 3.33, + "learning_rate": 8.292099917757612e-07, + "loss": 0.1756, + "step": 9210, + "task_loss": 0.43395519256591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0088109763562362, + "compression/movement_sparsity/importance_threshold": -0.7653775436372391, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15642675757408142, + "epoch": 3.33, + "learning_rate": 8.273458036394641e-07, + "loss": 0.1819, + "step": 9220, + "task_loss": 0.3005210757255554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008902286164984664, + "compression/movement_sparsity/importance_threshold": -0.7641337690955606, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15306097269058228, + "epoch": 3.34, + "learning_rate": 8.254822337415079e-07, + "loss": 0.1736, + "step": 9230, + "task_loss": 0.3201584815979004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008993496998284847, + "compression/movement_sparsity/importance_threshold": -0.7628913427457633, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11628354340791702, + "epoch": 3.34, + "learning_rate": 8.23619288754945e-07, + "loss": 0.1631, + "step": 9240, + "task_loss": 0.29428863525390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009084608909808174, + "compression/movement_sparsity/importance_threshold": -0.7616502638567629, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13086993992328644, + "epoch": 3.34, + "learning_rate": 8.217569753505883e-07, + "loss": 0.1668, + "step": 9250, + "task_loss": 0.3168758749961853 + }, + { + "epoch": 3.34, + "eval_exact_match": 83.52885525070955, + "eval_f1": 89.96188168895662, + "step": 9250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009175621953226057, + "compression/movement_sparsity/importance_threshold": -0.7604105316974756, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1549527943134308, + "epoch": 3.35, + "learning_rate": 8.198953001969908e-07, + "loss": 0.1774, + "step": 9260, + "task_loss": 0.35443389415740967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009266536182209873, + "compression/movement_sparsity/importance_threshold": -0.7591721455368179, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11498824506998062, + "epoch": 3.35, + "learning_rate": 8.180342699604192e-07, + "loss": 0.1719, + "step": 9270, + "task_loss": 0.29143670201301575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009357351650431077, + "compression/movement_sparsity/importance_threshold": -0.7579351046437052, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1407342255115509, + "epoch": 3.35, + "learning_rate": 8.161738913048309e-07, + "loss": 0.1843, + "step": 9280, + "task_loss": 0.46949753165245056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009448068411561065, + "compression/movement_sparsity/importance_threshold": -0.7566994082870538, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1577707827091217, + "epoch": 3.36, + "learning_rate": 8.145001131224242e-07, + "loss": 0.1688, + "step": 9290, + "task_loss": 0.2751985192298889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009538686519271247, + "compression/movement_sparsity/importance_threshold": -0.7554650557357798, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14712263643741608, + "epoch": 3.36, + "learning_rate": 8.126409908215325e-07, + "loss": 0.1608, + "step": 9300, + "task_loss": 0.28362199664115906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00962920602723303, + "compression/movement_sparsity/importance_threshold": -0.7542320462587994, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17634797096252441, + "epoch": 3.36, + "learning_rate": 8.107825394138224e-07, + "loss": 0.1866, + "step": 9310, + "task_loss": 0.7741411924362183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009719626989117821, + "compression/movement_sparsity/importance_threshold": -0.7530003791250285, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1491977721452713, + "epoch": 3.37, + "learning_rate": 8.089247655540163e-07, + "loss": 0.175, + "step": 9320, + "task_loss": 0.3922104239463806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009809949458597039, + "compression/movement_sparsity/importance_threshold": -0.7517700536033832, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16116444766521454, + "epoch": 3.37, + "learning_rate": 8.070676758944122e-07, + "loss": 0.1655, + "step": 9330, + "task_loss": 0.5367953777313232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009900173489342093, + "compression/movement_sparsity/importance_threshold": -0.7505410689627796, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1759243607521057, + "epoch": 3.38, + "learning_rate": 8.052112770848568e-07, + "loss": 0.1672, + "step": 9340, + "task_loss": 0.2593398094177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009990299135024385, + "compression/movement_sparsity/importance_threshold": -0.7493134244721338, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17565186321735382, + "epoch": 3.38, + "learning_rate": 8.033555757727237e-07, + "loss": 0.1661, + "step": 9350, + "task_loss": 0.4808293581008911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01008032644931534, + "compression/movement_sparsity/importance_threshold": -0.7480871194003618, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18272629380226135, + "epoch": 3.38, + "learning_rate": 8.015005786028893e-07, + "loss": 0.1827, + "step": 9360, + "task_loss": 0.47488927841186523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010170255485886355, + "compression/movement_sparsity/importance_threshold": -0.7468621530163796, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17263399064540863, + "epoch": 3.39, + "learning_rate": 7.996462922177072e-07, + "loss": 0.1621, + "step": 9370, + "task_loss": 0.4343196749687195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010260086298408851, + "compression/movement_sparsity/importance_threshold": -0.7456385245891034, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16438047587871552, + "epoch": 3.39, + "learning_rate": 7.977927232569877e-07, + "loss": 0.1699, + "step": 9380, + "task_loss": 0.4834184944629669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010349818940554234, + "compression/movement_sparsity/importance_threshold": -0.7444162333874491, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19111394882202148, + "epoch": 3.39, + "learning_rate": 7.959398783579698e-07, + "loss": 0.1637, + "step": 9390, + "task_loss": 0.6939228177070618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01043945346599391, + "compression/movement_sparsity/importance_threshold": -0.7431952786803331, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1576661765575409, + "epoch": 3.4, + "learning_rate": 7.940877641553021e-07, + "loss": 0.1579, + "step": 9400, + "task_loss": 0.4254157543182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0105289899283993, + "compression/movement_sparsity/importance_threshold": -0.7419756597366711, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13320744037628174, + "epoch": 3.4, + "learning_rate": 7.922363872810159e-07, + "loss": 0.1813, + "step": 9410, + "task_loss": 0.3047106862068176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010618428381441787, + "compression/movement_sparsity/importance_threshold": -0.7407573758253796, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16407856345176697, + "epoch": 3.4, + "learning_rate": 7.903857543645014e-07, + "loss": 0.1708, + "step": 9420, + "task_loss": 0.44201624393463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010707768878792823, + "compression/movement_sparsity/importance_threshold": -0.7395404262153742, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16521432995796204, + "epoch": 3.41, + "learning_rate": 7.885358720324865e-07, + "loss": 0.1759, + "step": 9430, + "task_loss": 0.3065508008003235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010797011474123798, + "compression/movement_sparsity/importance_threshold": -0.7383248101755712, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1531199812889099, + "epoch": 3.41, + "learning_rate": 7.866867469090096e-07, + "loss": 0.1814, + "step": 9440, + "task_loss": 0.6985189318656921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010886156221106119, + "compression/movement_sparsity/importance_threshold": -0.7371105269748865, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14802789688110352, + "epoch": 3.42, + "learning_rate": 7.848383856153991e-07, + "loss": 0.1597, + "step": 9450, + "task_loss": 0.40684616565704346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010975203173411207, + "compression/movement_sparsity/importance_threshold": -0.7358975758822364, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1340632140636444, + "epoch": 3.42, + "learning_rate": 7.829907947702478e-07, + "loss": 0.1784, + "step": 9460, + "task_loss": 0.5496231317520142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011064152384710467, + "compression/movement_sparsity/importance_threshold": -0.7346859561665366, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12589387595653534, + "epoch": 3.42, + "learning_rate": 7.811439809893896e-07, + "loss": 0.162, + "step": 9470, + "task_loss": 0.3624046742916107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011153003908675308, + "compression/movement_sparsity/importance_threshold": -0.7334756670967038, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1687452495098114, + "epoch": 3.43, + "learning_rate": 7.792979508858765e-07, + "loss": 0.1767, + "step": 9480, + "task_loss": 0.4035765826702118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01124175779897714, + "compression/movement_sparsity/importance_threshold": -0.7322667079416534, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16020125150680542, + "epoch": 3.43, + "learning_rate": 7.774527110699527e-07, + "loss": 0.1603, + "step": 9490, + "task_loss": 0.41458970308303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011330414109287378, + "compression/movement_sparsity/importance_threshold": -0.7310590779703019, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.132632777094841, + "epoch": 3.43, + "learning_rate": 7.756082681490345e-07, + "loss": 0.1682, + "step": 9500, + "task_loss": 0.3918096423149109 + }, + { + "epoch": 3.43, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.99568658968761, + "step": 9500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011418972893277427, + "compression/movement_sparsity/importance_threshold": -0.7298527764515652, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14293313026428223, + "epoch": 3.44, + "learning_rate": 7.737646287276834e-07, + "loss": 0.1846, + "step": 9510, + "task_loss": 0.34345972537994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011507434204618703, + "compression/movement_sparsity/importance_threshold": -0.7286478026543594, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.158808171749115, + "epoch": 3.44, + "learning_rate": 7.719217994075842e-07, + "loss": 0.1825, + "step": 9520, + "task_loss": 0.3061710596084595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011595798096982617, + "compression/movement_sparsity/importance_threshold": -0.7274441558476004, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1777794063091278, + "epoch": 3.44, + "learning_rate": 7.700797867875215e-07, + "loss": 0.1779, + "step": 9530, + "task_loss": 0.43014687299728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011684064624040575, + "compression/movement_sparsity/importance_threshold": -0.7262418353002045, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14890308678150177, + "epoch": 3.45, + "learning_rate": 7.682385974633539e-07, + "loss": 0.1667, + "step": 9540, + "task_loss": 0.4941771626472473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011772233839463965, + "compression/movement_sparsity/importance_threshold": -0.7250408402810881, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13499748706817627, + "epoch": 3.45, + "learning_rate": 7.663982380279936e-07, + "loss": 0.1855, + "step": 9550, + "task_loss": 0.5359944105148315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011860305796924247, + "compression/movement_sparsity/importance_threshold": -0.7238411700591665, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17753419280052185, + "epoch": 3.46, + "learning_rate": 7.645587150713797e-07, + "loss": 0.1758, + "step": 9560, + "task_loss": 0.2920956611633301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011948280550092807, + "compression/movement_sparsity/importance_threshold": -0.722642823903356, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1481466293334961, + "epoch": 3.46, + "learning_rate": 7.627200351804573e-07, + "loss": 0.167, + "step": 9570, + "task_loss": 0.4186212122440338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012036158152641053, + "compression/movement_sparsity/importance_threshold": -0.721445801082573, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1611461639404297, + "epoch": 3.46, + "learning_rate": 7.608822049391522e-07, + "loss": 0.1624, + "step": 9580, + "task_loss": 0.5867197513580322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012123938658240395, + "compression/movement_sparsity/importance_threshold": -0.7202501008657333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17020216584205627, + "epoch": 3.47, + "learning_rate": 7.590452309283473e-07, + "loss": 0.1777, + "step": 9590, + "task_loss": 0.5484004020690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012211622120562255, + "compression/movement_sparsity/importance_threshold": -0.7190557225217529, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16823932528495789, + "epoch": 3.47, + "learning_rate": 7.572091197258605e-07, + "loss": 0.1712, + "step": 9600, + "task_loss": 0.42677053809165955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012299208593278026, + "compression/movement_sparsity/importance_threshold": -0.7178626653195481, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1810576617717743, + "epoch": 3.47, + "learning_rate": 7.553738779064191e-07, + "loss": 0.1867, + "step": 9610, + "task_loss": 0.6101388931274414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012386698130059135, + "compression/movement_sparsity/importance_threshold": -0.7166709285280347, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15320083498954773, + "epoch": 3.48, + "learning_rate": 7.53539512041638e-07, + "loss": 0.1972, + "step": 9620, + "task_loss": 0.45958250761032104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012474090784576982, + "compression/movement_sparsity/importance_threshold": -0.7154805114161291, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17546682059764862, + "epoch": 3.48, + "learning_rate": 7.51706028699995e-07, + "loss": 0.1772, + "step": 9630, + "task_loss": 0.44312983751296997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012561386610502979, + "compression/movement_sparsity/importance_threshold": -0.7142914132527471, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1869005411863327, + "epoch": 3.48, + "learning_rate": 7.498734344468085e-07, + "loss": 0.1721, + "step": 9640, + "task_loss": 0.4443809986114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012648585661508538, + "compression/movement_sparsity/importance_threshold": -0.713103633306805, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21304626762866974, + "epoch": 3.49, + "learning_rate": 7.480417358442131e-07, + "loss": 0.1735, + "step": 9650, + "task_loss": 0.4839943051338196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012735687991265069, + "compression/movement_sparsity/importance_threshold": -0.7119171708472186, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1490074098110199, + "epoch": 3.49, + "learning_rate": 7.462109394511352e-07, + "loss": 0.1827, + "step": 9660, + "task_loss": 0.6885297298431396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01282269365344399, + "compression/movement_sparsity/importance_threshold": -0.7107320251429041, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14718842506408691, + "epoch": 3.49, + "learning_rate": 7.443810518232723e-07, + "loss": 0.1786, + "step": 9670, + "task_loss": 0.34075993299484253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012909602701716701, + "compression/movement_sparsity/importance_threshold": -0.7095481954627776, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2005831003189087, + "epoch": 3.5, + "learning_rate": 7.425520795130658e-07, + "loss": 0.1759, + "step": 9680, + "task_loss": 0.5733194351196289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012996415189754616, + "compression/movement_sparsity/importance_threshold": -0.7083656810757551, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13851837813854218, + "epoch": 3.5, + "learning_rate": 7.407240290696813e-07, + "loss": 0.1682, + "step": 9690, + "task_loss": 0.24197955429553986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013083131171229131, + "compression/movement_sparsity/importance_threshold": -0.707184481250753, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15892581641674042, + "epoch": 3.51, + "learning_rate": 7.388969070389828e-07, + "loss": 0.1781, + "step": 9700, + "task_loss": 0.3478658199310303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013169750699811683, + "compression/movement_sparsity/importance_threshold": -0.7060045952566868, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13390114903450012, + "epoch": 3.51, + "learning_rate": 7.370707199635094e-07, + "loss": 0.1746, + "step": 9710, + "task_loss": 0.46922826766967773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01325627382917368, + "compression/movement_sparsity/importance_threshold": -0.7048260223624727, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14467637240886688, + "epoch": 3.51, + "learning_rate": 7.352454743824531e-07, + "loss": 0.16, + "step": 9720, + "task_loss": 0.2225886881351471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01334270061298652, + "compression/movement_sparsity/importance_threshold": -0.7036487618370271, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14436565339565277, + "epoch": 3.52, + "learning_rate": 7.334211768316338e-07, + "loss": 0.1733, + "step": 9730, + "task_loss": 0.7009656429290771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013429031104921623, + "compression/movement_sparsity/importance_threshold": -0.7024728129492657, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1612703502178192, + "epoch": 3.52, + "learning_rate": 7.315978338434773e-07, + "loss": 0.1794, + "step": 9740, + "task_loss": 0.45459383726119995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013515265358650386, + "compression/movement_sparsity/importance_threshold": -0.7012981749681048, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1257527470588684, + "epoch": 3.52, + "learning_rate": 7.297754519469909e-07, + "loss": 0.1659, + "step": 9750, + "task_loss": 0.6451063752174377 + }, + { + "epoch": 3.52, + "eval_exact_match": 83.50993377483444, + "eval_f1": 89.97423247909468, + "step": 9750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013601403427844234, + "compression/movement_sparsity/importance_threshold": -0.7001248471624604, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1723412126302719, + "epoch": 3.53, + "learning_rate": 7.279540376677407e-07, + "loss": 0.1644, + "step": 9760, + "task_loss": 0.5570217370986938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01368744536617457, + "compression/movement_sparsity/importance_threshold": -0.6989528288012485, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1490807831287384, + "epoch": 3.53, + "learning_rate": 7.263155975197417e-07, + "loss": 0.1819, + "step": 9770, + "task_loss": 0.36897680163383484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0137733912273128, + "compression/movement_sparsity/importance_threshold": -0.6977821191533853, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15225452184677124, + "epoch": 3.53, + "learning_rate": 7.244960396787469e-07, + "loss": 0.1688, + "step": 9780, + "task_loss": 0.27469679713249207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013859241064930345, + "compression/movement_sparsity/importance_threshold": -0.6966127174877867, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.161770761013031, + "epoch": 3.54, + "learning_rate": 7.226774683594532e-07, + "loss": 0.1737, + "step": 9790, + "task_loss": 0.562671422958374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013944994932698615, + "compression/movement_sparsity/importance_threshold": -0.695444623073369, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1080978512763977, + "epoch": 3.54, + "learning_rate": 7.208598900737806e-07, + "loss": 0.1776, + "step": 9800, + "task_loss": 0.15812508761882782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014030652884289011, + "compression/movement_sparsity/importance_threshold": -0.6942778351790481, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12826144695281982, + "epoch": 3.55, + "learning_rate": 7.190433113300952e-07, + "loss": 0.1663, + "step": 9810, + "task_loss": 0.3542582392692566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014116214973372957, + "compression/movement_sparsity/importance_threshold": -0.69311235307374, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16906192898750305, + "epoch": 3.55, + "learning_rate": 7.172277386331832e-07, + "loss": 0.1832, + "step": 9820, + "task_loss": 0.47162115573883057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01420168125362185, + "compression/movement_sparsity/importance_threshold": -0.6919481760263608, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16506068408489227, + "epoch": 3.55, + "learning_rate": 7.154131784842279e-07, + "loss": 0.1837, + "step": 9830, + "task_loss": 0.5075066089630127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014287051778707111, + "compression/movement_sparsity/importance_threshold": -0.6907853033058268, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18274077773094177, + "epoch": 3.56, + "learning_rate": 7.13599637380788e-07, + "loss": 0.1698, + "step": 9840, + "task_loss": 0.5426364541053772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014372326602300119, + "compression/movement_sparsity/importance_threshold": -0.6896237341810542, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1831640601158142, + "epoch": 3.56, + "learning_rate": 7.117871218167716e-07, + "loss": 0.1616, + "step": 9850, + "task_loss": 0.7144515514373779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014457505778072335, + "compression/movement_sparsity/importance_threshold": -0.6884634679209584, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1759902834892273, + "epoch": 3.56, + "learning_rate": 7.09975638282416e-07, + "loss": 0.1769, + "step": 9860, + "task_loss": 0.35797786712646484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014542589359695154, + "compression/movement_sparsity/importance_threshold": -0.6873045037944556, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1537725031375885, + "epoch": 3.57, + "learning_rate": 7.081651932642628e-07, + "loss": 0.1887, + "step": 9870, + "task_loss": 0.23532593250274658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014627577400839973, + "compression/movement_sparsity/importance_threshold": -0.6861468410704624, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17610417306423187, + "epoch": 3.57, + "learning_rate": 7.06355793245134e-07, + "loss": 0.164, + "step": 9880, + "task_loss": 0.4238791763782501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014712469955178206, + "compression/movement_sparsity/importance_threshold": -0.6849904790178946, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.191780686378479, + "epoch": 3.57, + "learning_rate": 7.045474447041106e-07, + "loss": 0.1914, + "step": 9890, + "task_loss": 0.7447193264961243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014797267076381265, + "compression/movement_sparsity/importance_threshold": -0.6838354169056681, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18503516912460327, + "epoch": 3.58, + "learning_rate": 7.027401541165079e-07, + "loss": 0.1899, + "step": 9900, + "task_loss": 0.3769063651561737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014881968818120567, + "compression/movement_sparsity/importance_threshold": -0.6826816540026991, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16122739017009735, + "epoch": 3.58, + "learning_rate": 7.009339279538536e-07, + "loss": 0.1648, + "step": 9910, + "task_loss": 0.8050833344459534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014966575234067515, + "compression/movement_sparsity/importance_threshold": -0.6815291895779036, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17400379478931427, + "epoch": 3.59, + "learning_rate": 6.991287726838628e-07, + "loss": 0.192, + "step": 9920, + "task_loss": 0.508493959903717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015051086377893522, + "compression/movement_sparsity/importance_threshold": -0.6803780229001979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15540730953216553, + "epoch": 3.59, + "learning_rate": 6.973246947704171e-07, + "loss": 0.1824, + "step": 9930, + "task_loss": 0.4139855206012726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015135502303269998, + "compression/movement_sparsity/importance_threshold": -0.6792281532384978, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15979820489883423, + "epoch": 3.59, + "learning_rate": 6.955217006735398e-07, + "loss": 0.1839, + "step": 9940, + "task_loss": 0.5479603409767151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015219823063868352, + "compression/movement_sparsity/importance_threshold": -0.6780795798617196, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21421319246292114, + "epoch": 3.6, + "learning_rate": 6.937197968493731e-07, + "loss": 0.1756, + "step": 9950, + "task_loss": 0.8114826083183289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01530404871336, + "compression/movement_sparsity/importance_threshold": -0.6769323020387791, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18751713633537292, + "epoch": 3.6, + "learning_rate": 6.919189897501558e-07, + "loss": 0.182, + "step": 9960, + "task_loss": 0.3587406575679779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015388179305416349, + "compression/movement_sparsity/importance_threshold": -0.6757863190385925, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18434390425682068, + "epoch": 3.6, + "learning_rate": 6.901192858241987e-07, + "loss": 0.1779, + "step": 9970, + "task_loss": 0.2552274465560913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01547221489370881, + "compression/movement_sparsity/importance_threshold": -0.6746416301300759, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14075002074241638, + "epoch": 3.61, + "learning_rate": 6.883206915158626e-07, + "loss": 0.1716, + "step": 9980, + "task_loss": 0.3761569857597351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015556155531908776, + "compression/movement_sparsity/importance_threshold": -0.6734982345821456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19011226296424866, + "epoch": 3.61, + "learning_rate": 6.865232132655361e-07, + "loss": 0.1753, + "step": 9990, + "task_loss": 0.5310980081558228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015640001273687695, + "compression/movement_sparsity/importance_threshold": -0.6723561316637171, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1842782199382782, + "epoch": 3.61, + "learning_rate": 6.847268575096094e-07, + "loss": 0.1791, + "step": 10000, + "task_loss": 0.4414299428462982 + }, + { + "epoch": 3.61, + "eval_exact_match": 83.61400189214758, + "eval_f1": 90.00351389360986, + "step": 10000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015723752172716956, + "compression/movement_sparsity/importance_threshold": -0.6712153206437068, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17395688593387604, + "epoch": 3.62, + "learning_rate": 6.829316306804554e-07, + "loss": 0.1791, + "step": 10010, + "task_loss": 0.4833320379257202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015807408282667967, + "compression/movement_sparsity/importance_threshold": -0.6700758007910308, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13925310969352722, + "epoch": 3.62, + "learning_rate": 6.811375392064027e-07, + "loss": 0.1643, + "step": 10020, + "task_loss": 0.34654271602630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01589096965721215, + "compression/movement_sparsity/importance_threshold": -0.6689375713746049, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1650667041540146, + "epoch": 3.62, + "learning_rate": 6.793445895117156e-07, + "loss": 0.1744, + "step": 10030, + "task_loss": 0.395840048789978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0159744363500209, + "compression/movement_sparsity/importance_threshold": -0.6678006316633456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16980409622192383, + "epoch": 3.63, + "learning_rate": 6.775527880165703e-07, + "loss": 0.1817, + "step": 10040, + "task_loss": 0.44540518522262573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016057808414765642, + "compression/movement_sparsity/importance_threshold": -0.6666649809261685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.181473508477211, + "epoch": 3.63, + "learning_rate": 6.7576214113703e-07, + "loss": 0.1781, + "step": 10050, + "task_loss": 0.4364219307899475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016141085905117782, + "compression/movement_sparsity/importance_threshold": -0.66553061843199, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1557856649160385, + "epoch": 3.64, + "learning_rate": 6.739726552850247e-07, + "loss": 0.1582, + "step": 10060, + "task_loss": 0.31843990087509155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016224268874748718, + "compression/movement_sparsity/importance_threshold": -0.6643975434497262, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1424463987350464, + "epoch": 3.64, + "learning_rate": 6.721843368683263e-07, + "loss": 0.1705, + "step": 10070, + "task_loss": 0.5333583354949951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016307357377329884, + "compression/movement_sparsity/importance_threshold": -0.663265755248293, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17959821224212646, + "epoch": 3.64, + "learning_rate": 6.703971922905274e-07, + "loss": 0.182, + "step": 10080, + "task_loss": 0.6014090776443481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016390351466532678, + "compression/movement_sparsity/importance_threshold": -0.6621352530966064, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1679791957139969, + "epoch": 3.65, + "learning_rate": 6.686112279510157e-07, + "loss": 0.1826, + "step": 10090, + "task_loss": 0.3521926999092102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016473251196028507, + "compression/movement_sparsity/importance_threshold": -0.6610060362635826, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14580708742141724, + "epoch": 3.65, + "learning_rate": 6.668264502449541e-07, + "loss": 0.1765, + "step": 10100, + "task_loss": 0.39751163125038147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016556056619488783, + "compression/movement_sparsity/importance_threshold": -0.6598781040181376, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13637031614780426, + "epoch": 3.65, + "learning_rate": 6.650428655632563e-07, + "loss": 0.1668, + "step": 10110, + "task_loss": 0.51078200340271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01663876779058492, + "compression/movement_sparsity/importance_threshold": -0.6587514556291877, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15260596573352814, + "epoch": 3.66, + "learning_rate": 6.63260480292563e-07, + "loss": 0.1765, + "step": 10120, + "task_loss": 0.2957655191421509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016721384762988314, + "compression/movement_sparsity/importance_threshold": -0.6576260903656488, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1376221477985382, + "epoch": 3.66, + "learning_rate": 6.614793008152212e-07, + "loss": 0.1603, + "step": 10130, + "task_loss": 0.33394917845726013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016803907590370405, + "compression/movement_sparsity/importance_threshold": -0.6565020074964367, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16349947452545166, + "epoch": 3.66, + "learning_rate": 6.596993335092593e-07, + "loss": 0.1774, + "step": 10140, + "task_loss": 0.5207610726356506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016886336326402594, + "compression/movement_sparsity/importance_threshold": -0.6553792062904678, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15152761340141296, + "epoch": 3.67, + "learning_rate": 6.57920584748366e-07, + "loss": 0.1672, + "step": 10150, + "task_loss": 0.5067110657691956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016968671024756275, + "compression/movement_sparsity/importance_threshold": -0.6542576860166581, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15529385209083557, + "epoch": 3.67, + "learning_rate": 6.561430609018667e-07, + "loss": 0.1685, + "step": 10160, + "task_loss": 0.34697654843330383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017050911739102878, + "compression/movement_sparsity/importance_threshold": -0.6531374459439235, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19189101457595825, + "epoch": 3.68, + "learning_rate": 6.543667683346991e-07, + "loss": 0.1928, + "step": 10170, + "task_loss": 0.45649850368499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0171330585231138, + "compression/movement_sparsity/importance_threshold": -0.6520184853411803, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16808871924877167, + "epoch": 3.68, + "learning_rate": 6.52591713407394e-07, + "loss": 0.1602, + "step": 10180, + "task_loss": 0.37760788202285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017215111430460457, + "compression/movement_sparsity/importance_threshold": -0.6509008034773445, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1413610577583313, + "epoch": 3.68, + "learning_rate": 6.508179024760487e-07, + "loss": 0.1723, + "step": 10190, + "task_loss": 0.2622392773628235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01729707051481426, + "compression/movement_sparsity/importance_threshold": -0.649784399621332, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1771795153617859, + "epoch": 3.69, + "learning_rate": 6.49045341892308e-07, + "loss": 0.1769, + "step": 10200, + "task_loss": 0.6021856069564819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017378935829846617, + "compression/movement_sparsity/importance_threshold": -0.6486692730420591, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1472265124320984, + "epoch": 3.69, + "learning_rate": 6.47274038003337e-07, + "loss": 0.1614, + "step": 10210, + "task_loss": 0.36952024698257446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017460707429228936, + "compression/movement_sparsity/importance_threshold": -0.6475554230084418, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12765908241271973, + "epoch": 3.69, + "learning_rate": 6.455039971518029e-07, + "loss": 0.1698, + "step": 10220, + "task_loss": 0.42384105920791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017542385366632635, + "compression/movement_sparsity/importance_threshold": -0.6464428487893962, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17819051444530487, + "epoch": 3.7, + "learning_rate": 6.437352256758495e-07, + "loss": 0.1721, + "step": 10230, + "task_loss": 0.4150475859642029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017623969695729123, + "compression/movement_sparsity/importance_threshold": -0.6453315496538382, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1502448171377182, + "epoch": 3.7, + "learning_rate": 6.419677299090748e-07, + "loss": 0.1764, + "step": 10240, + "task_loss": 0.268942266702652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017705460470189813, + "compression/movement_sparsity/importance_threshold": -0.644221524870684, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22100843489170074, + "epoch": 3.7, + "learning_rate": 6.402015161805097e-07, + "loss": 0.1917, + "step": 10250, + "task_loss": 0.2643333077430725 + }, + { + "epoch": 3.7, + "eval_exact_match": 83.61400189214758, + "eval_f1": 89.94770097613869, + "step": 10250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01778685774368611, + "compression/movement_sparsity/importance_threshold": -0.6431127737088496, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1667742282152176, + "epoch": 3.71, + "learning_rate": 6.384365908145933e-07, + "loss": 0.1676, + "step": 10260, + "task_loss": 0.3106589913368225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017868161569889417, + "compression/movement_sparsity/importance_threshold": -0.6420052954372512, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15289315581321716, + "epoch": 3.71, + "learning_rate": 6.366729601311521e-07, + "loss": 0.1596, + "step": 10270, + "task_loss": 0.24741002917289734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017949372002471143, + "compression/movement_sparsity/importance_threshold": -0.6408990893248049, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17078140377998352, + "epoch": 3.72, + "learning_rate": 6.349106304453769e-07, + "loss": 0.1687, + "step": 10280, + "task_loss": 0.1850242167711258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018030489095102725, + "compression/movement_sparsity/importance_threshold": -0.6397941546404264, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15359310805797577, + "epoch": 3.72, + "learning_rate": 6.331496080677985e-07, + "loss": 0.1632, + "step": 10290, + "task_loss": 0.2576896548271179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01811151290145556, + "compression/movement_sparsity/importance_threshold": -0.6386904906530321, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1645045280456543, + "epoch": 3.72, + "learning_rate": 6.313898993042681e-07, + "loss": 0.177, + "step": 10300, + "task_loss": 0.43283289670944214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018192443475201055, + "compression/movement_sparsity/importance_threshold": -0.6375880966315379, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1887405514717102, + "epoch": 3.73, + "learning_rate": 6.29631510455932e-07, + "loss": 0.1839, + "step": 10310, + "task_loss": 0.36919164657592773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018273280870010622, + "compression/movement_sparsity/importance_threshold": -0.63648697184486, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17336630821228027, + "epoch": 3.73, + "learning_rate": 6.278744478192113e-07, + "loss": 0.172, + "step": 10320, + "task_loss": 0.5593419075012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018354025139555672, + "compression/movement_sparsity/importance_threshold": -0.6353871155619144, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18160130083560944, + "epoch": 3.73, + "learning_rate": 6.261187176857765e-07, + "loss": 0.17, + "step": 10330, + "task_loss": 0.3647115230560303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018434676337507616, + "compression/movement_sparsity/importance_threshold": -0.6342885270516171, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1591804027557373, + "epoch": 3.74, + "learning_rate": 6.243643263425285e-07, + "loss": 0.1672, + "step": 10340, + "task_loss": 0.6932095289230347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018515234517537865, + "compression/movement_sparsity/importance_threshold": -0.6331912055828843, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17311957478523254, + "epoch": 3.74, + "learning_rate": 6.226112800715733e-07, + "loss": 0.1676, + "step": 10350, + "task_loss": 0.2462514042854309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018595699733317827, + "compression/movement_sparsity/importance_threshold": -0.632095150424632, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14102676510810852, + "epoch": 3.74, + "learning_rate": 6.208595851502003e-07, + "loss": 0.1857, + "step": 10360, + "task_loss": 0.4470866024494171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01867607203851891, + "compression/movement_sparsity/importance_threshold": -0.6310003608457764, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15304112434387207, + "epoch": 3.75, + "learning_rate": 6.191092478508611e-07, + "loss": 0.1751, + "step": 10370, + "task_loss": 0.49549001455307007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018756351486812525, + "compression/movement_sparsity/importance_threshold": -0.6299068361152333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17790237069129944, + "epoch": 3.75, + "learning_rate": 6.173602744411445e-07, + "loss": 0.1745, + "step": 10380, + "task_loss": 0.38540562987327576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018836538131870098, + "compression/movement_sparsity/importance_threshold": -0.628814575501919, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18003492057323456, + "epoch": 3.75, + "learning_rate": 6.156126711837563e-07, + "loss": 0.1702, + "step": 10390, + "task_loss": 0.5601201057434082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018916632027363017, + "compression/movement_sparsity/importance_threshold": -0.6277235782747494, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19177278876304626, + "epoch": 3.76, + "learning_rate": 6.138664443364964e-07, + "loss": 0.1847, + "step": 10400, + "task_loss": 0.6107884049415588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018996633226962715, + "compression/movement_sparsity/importance_threshold": -0.6266338437026406, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15945954620838165, + "epoch": 3.76, + "learning_rate": 6.121216001522353e-07, + "loss": 0.1776, + "step": 10410, + "task_loss": 0.47822341322898865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019076541784340567, + "compression/movement_sparsity/importance_threshold": -0.625545371054509, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17401601374149323, + "epoch": 3.77, + "learning_rate": 6.103781448788929e-07, + "loss": 0.1751, + "step": 10420, + "task_loss": 0.3851965665817261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019156357753168023, + "compression/movement_sparsity/importance_threshold": -0.6244581595992702, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1592417061328888, + "epoch": 3.77, + "learning_rate": 6.086360847594153e-07, + "loss": 0.1729, + "step": 10430, + "task_loss": 0.5951048135757446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01923608118711648, + "compression/movement_sparsity/importance_threshold": -0.6233722086058404, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17043137550354004, + "epoch": 3.77, + "learning_rate": 6.068954260317535e-07, + "loss": 0.1583, + "step": 10440, + "task_loss": 0.355490505695343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01931571213985735, + "compression/movement_sparsity/importance_threshold": -0.6222875173431357, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1629900336265564, + "epoch": 3.78, + "learning_rate": 6.051561749288404e-07, + "loss": 0.1891, + "step": 10450, + "task_loss": 0.36418741941452026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019395250665062033, + "compression/movement_sparsity/importance_threshold": -0.6212040850800722, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1411941647529602, + "epoch": 3.78, + "learning_rate": 6.034183376785675e-07, + "loss": 0.1646, + "step": 10460, + "task_loss": 0.45510485768318176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019474696816401953, + "compression/movement_sparsity/importance_threshold": -0.620121911085566, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15264548361301422, + "epoch": 3.78, + "learning_rate": 6.016819205037645e-07, + "loss": 0.1679, + "step": 10470, + "task_loss": 0.4522840976715088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019554050647548515, + "compression/movement_sparsity/importance_threshold": -0.6190409946285329, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16963550448417664, + "epoch": 3.79, + "learning_rate": 5.999469296221759e-07, + "loss": 0.1836, + "step": 10480, + "task_loss": 0.4056473970413208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019633312212173126, + "compression/movement_sparsity/importance_threshold": -0.6179613349778894, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15899108350276947, + "epoch": 3.79, + "learning_rate": 5.982133712464392e-07, + "loss": 0.1595, + "step": 10490, + "task_loss": 0.291488379240036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0197124815639472, + "compression/movement_sparsity/importance_threshold": -0.6168829314025512, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15133503079414368, + "epoch": 3.79, + "learning_rate": 5.964812515840616e-07, + "loss": 0.1754, + "step": 10500, + "task_loss": 0.2718963325023651 + }, + { + "epoch": 3.79, + "eval_exact_match": 83.65184484389782, + "eval_f1": 90.02195293125705, + "step": 10500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01979155875654215, + "compression/movement_sparsity/importance_threshold": -0.6158057831714346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16352495551109314, + "epoch": 3.8, + "learning_rate": 5.947505768373991e-07, + "loss": 0.1724, + "step": 10510, + "task_loss": 0.27979862689971924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01987054384362938, + "compression/movement_sparsity/importance_threshold": -0.6147298895534556, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15177211165428162, + "epoch": 3.8, + "learning_rate": 5.930213532036344e-07, + "loss": 0.1713, + "step": 10520, + "task_loss": 0.4249712824821472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0199494368788803, + "compression/movement_sparsity/importance_threshold": -0.6136552498175303, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1794542670249939, + "epoch": 3.81, + "learning_rate": 5.912935868747525e-07, + "loss": 0.1846, + "step": 10530, + "task_loss": 0.4833950400352478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02002823791596634, + "compression/movement_sparsity/importance_threshold": -0.6125818632325746, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15238790214061737, + "epoch": 3.81, + "learning_rate": 5.895672840375216e-07, + "loss": 0.1683, + "step": 10540, + "task_loss": 0.34725263714790344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02010694700855888, + "compression/movement_sparsity/importance_threshold": -0.6115097290675047, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19352784752845764, + "epoch": 3.81, + "learning_rate": 5.878424508734687e-07, + "loss": 0.1865, + "step": 10550, + "task_loss": 0.4109495282173157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02018556421032935, + "compression/movement_sparsity/importance_threshold": -0.6104388465912367, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1467682123184204, + "epoch": 3.82, + "learning_rate": 5.861190935588583e-07, + "loss": 0.1751, + "step": 10560, + "task_loss": 0.39710533618927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020264089574949143, + "compression/movement_sparsity/importance_threshold": -0.6093692150726868, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17885571718215942, + "epoch": 3.82, + "learning_rate": 5.843972182646706e-07, + "loss": 0.1743, + "step": 10570, + "task_loss": 0.4837023913860321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0203425231560897, + "compression/movement_sparsity/importance_threshold": -0.6083008337807706, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1784561276435852, + "epoch": 3.82, + "learning_rate": 5.826768311565777e-07, + "loss": 0.1766, + "step": 10580, + "task_loss": 0.4210522174835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02042086500742242, + "compression/movement_sparsity/importance_threshold": -0.6072337019844045, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14249750971794128, + "epoch": 3.83, + "learning_rate": 5.809579383949251e-07, + "loss": 0.1779, + "step": 10590, + "task_loss": 0.330593079328537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0204991151826187, + "compression/movement_sparsity/importance_threshold": -0.6061678189525045, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17461369931697845, + "epoch": 3.83, + "learning_rate": 5.792405461347049e-07, + "loss": 0.1777, + "step": 10600, + "task_loss": 0.40263602137565613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02057727373534996, + "compression/movement_sparsity/importance_threshold": -0.6051031839539867, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18333062529563904, + "epoch": 3.83, + "learning_rate": 5.775246605255384e-07, + "loss": 0.181, + "step": 10610, + "task_loss": 0.541496992111206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020655340719287613, + "compression/movement_sparsity/importance_threshold": -0.6040397962577672, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18711034953594208, + "epoch": 3.84, + "learning_rate": 5.758102877116498e-07, + "loss": 0.1763, + "step": 10620, + "task_loss": 0.3023425340652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020733316188103066, + "compression/movement_sparsity/importance_threshold": -0.6029776551327619, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16456711292266846, + "epoch": 3.84, + "learning_rate": 5.740974338318476e-07, + "loss": 0.1626, + "step": 10630, + "task_loss": 0.5074876546859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020811200195467734, + "compression/movement_sparsity/importance_threshold": -0.601916759847887, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14603650569915771, + "epoch": 3.85, + "learning_rate": 5.723861050195018e-07, + "loss": 0.1859, + "step": 10640, + "task_loss": 0.5426656007766724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02088899279505301, + "compression/movement_sparsity/importance_threshold": -0.6008571096720587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19825714826583862, + "epoch": 3.85, + "learning_rate": 5.706763074025198e-07, + "loss": 0.1859, + "step": 10650, + "task_loss": 0.45735934376716614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02096669404053033, + "compression/movement_sparsity/importance_threshold": -0.5997987038741928, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12421312928199768, + "epoch": 3.85, + "learning_rate": 5.689680471033278e-07, + "loss": 0.1666, + "step": 10660, + "task_loss": 0.292208194732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021044303985571094, + "compression/movement_sparsity/importance_threshold": -0.5987415417232054, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1720031499862671, + "epoch": 3.86, + "learning_rate": 5.672613302388456e-07, + "loss": 0.1797, + "step": 10670, + "task_loss": 0.4465997815132141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021121822683846707, + "compression/movement_sparsity/importance_threshold": -0.5976856224880127, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13473817706108093, + "epoch": 3.86, + "learning_rate": 5.655561629204675e-07, + "loss": 0.1779, + "step": 10680, + "task_loss": 0.5434313416481018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021199250189028584, + "compression/movement_sparsity/importance_threshold": -0.5966309454375308, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1842292696237564, + "epoch": 3.86, + "learning_rate": 5.638525512540389e-07, + "loss": 0.1743, + "step": 10690, + "task_loss": 0.846764326095581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021276586554788132, + "compression/movement_sparsity/importance_threshold": -0.5955775098406757, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12809939682483673, + "epoch": 3.87, + "learning_rate": 5.621505013398344e-07, + "loss": 0.1692, + "step": 10700, + "task_loss": 0.359587699174881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021353831834796756, + "compression/movement_sparsity/importance_threshold": -0.5945253149663636, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17134542763233185, + "epoch": 3.87, + "learning_rate": 5.604500192725374e-07, + "loss": 0.184, + "step": 10710, + "task_loss": 0.6001778841018677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021430986082725884, + "compression/movement_sparsity/importance_threshold": -0.5934743600835102, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1271722912788391, + "epoch": 3.87, + "learning_rate": 5.587511111412151e-07, + "loss": 0.1631, + "step": 10720, + "task_loss": 0.2941231429576874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021508049352246924, + "compression/movement_sparsity/importance_threshold": -0.5924246444610318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14123263955116272, + "epoch": 3.88, + "learning_rate": 5.570537830293006e-07, + "loss": 0.1617, + "step": 10730, + "task_loss": 0.2916907072067261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02158502169703128, + "compression/movement_sparsity/importance_threshold": -0.5913761673678445, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13426288962364197, + "epoch": 3.88, + "learning_rate": 5.553580410145688e-07, + "loss": 0.1659, + "step": 10740, + "task_loss": 0.5448470115661621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02166190317075037, + "compression/movement_sparsity/importance_threshold": -0.5903289280728642, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13951101899147034, + "epoch": 3.89, + "learning_rate": 5.536638911691143e-07, + "loss": 0.1793, + "step": 10750, + "task_loss": 0.40928834676742554 + }, + { + "epoch": 3.89, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.997904877457, + "step": 10750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02173869382707558, + "compression/movement_sparsity/importance_threshold": -0.5892829258450072, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1252758651971817, + "epoch": 3.89, + "learning_rate": 5.519713395593321e-07, + "loss": 0.172, + "step": 10760, + "task_loss": 0.2951388359069824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021815393719678354, + "compression/movement_sparsity/importance_threshold": -0.5882381599531894, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2374802976846695, + "epoch": 3.89, + "learning_rate": 5.502803922458924e-07, + "loss": 0.1794, + "step": 10770, + "task_loss": 0.4524235725402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021892002902230083, + "compression/movement_sparsity/importance_threshold": -0.5871946296663269, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19318094849586487, + "epoch": 3.9, + "learning_rate": 5.485910552837225e-07, + "loss": 0.176, + "step": 10780, + "task_loss": 0.4039768576622009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021968521428402183, + "compression/movement_sparsity/importance_threshold": -0.5861523342533359, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13738399744033813, + "epoch": 3.9, + "learning_rate": 5.469033347219816e-07, + "loss": 0.1636, + "step": 10790, + "task_loss": 0.5032458901405334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02204494935186606, + "compression/movement_sparsity/importance_threshold": -0.5851112729831323, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17375808954238892, + "epoch": 3.9, + "learning_rate": 5.452172366040423e-07, + "loss": 0.1675, + "step": 10800, + "task_loss": 0.6967633962631226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022121286726293127, + "compression/movement_sparsity/importance_threshold": -0.5840714451246322, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17048680782318115, + "epoch": 3.91, + "learning_rate": 5.435327669674672e-07, + "loss": 0.1687, + "step": 10810, + "task_loss": 0.30930009484291077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022197533605354803, + "compression/movement_sparsity/importance_threshold": -0.5830328499467516, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1512829065322876, + "epoch": 3.91, + "learning_rate": 5.418499318439875e-07, + "loss": 0.1747, + "step": 10820, + "task_loss": 0.35338839888572693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022273690042722484, + "compression/movement_sparsity/importance_threshold": -0.5819954867184067, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.186394602060318, + "epoch": 3.91, + "learning_rate": 5.401687372594819e-07, + "loss": 0.1737, + "step": 10830, + "task_loss": 0.25927868485450745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022349756092067595, + "compression/movement_sparsity/importance_threshold": -0.5809593547085136, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20156210660934448, + "epoch": 3.92, + "learning_rate": 5.384891892339539e-07, + "loss": 0.1807, + "step": 10840, + "task_loss": 0.5239515900611877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022425731807061536, + "compression/movement_sparsity/importance_threshold": -0.5799244531859882, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15591740608215332, + "epoch": 3.92, + "learning_rate": 5.368112937815114e-07, + "loss": 0.1818, + "step": 10850, + "task_loss": 0.2408059537410736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022501617241375716, + "compression/movement_sparsity/importance_threshold": -0.5788907814197467, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.147825688123703, + "epoch": 3.92, + "learning_rate": 5.351350569103459e-07, + "loss": 0.1575, + "step": 10860, + "task_loss": 0.3448067009449005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022577412448681538, + "compression/movement_sparsity/importance_threshold": -0.5778583386787053, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21096472442150116, + "epoch": 3.93, + "learning_rate": 5.334604846227077e-07, + "loss": 0.1845, + "step": 10870, + "task_loss": 0.7965984344482422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022653117482650437, + "compression/movement_sparsity/importance_threshold": -0.5768271242317797, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18514427542686462, + "epoch": 3.93, + "learning_rate": 5.317875829148885e-07, + "loss": 0.1687, + "step": 10880, + "task_loss": 0.43882808089256287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02272873239695382, + "compression/movement_sparsity/importance_threshold": -0.5757971373478863, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1756231188774109, + "epoch": 3.94, + "learning_rate": 5.301163577771966e-07, + "loss": 0.1893, + "step": 10890, + "task_loss": 0.36089468002319336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02280425724526309, + "compression/movement_sparsity/importance_threshold": -0.5747683772959408, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13075178861618042, + "epoch": 3.94, + "learning_rate": 5.284468151939383e-07, + "loss": 0.178, + "step": 10900, + "task_loss": 0.5520082712173462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022879692081249646, + "compression/movement_sparsity/importance_threshold": -0.5737408433448596, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15874437987804413, + "epoch": 3.94, + "learning_rate": 5.267789611433934e-07, + "loss": 0.1804, + "step": 10910, + "task_loss": 0.3404083847999573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022955036958584914, + "compression/movement_sparsity/importance_threshold": -0.5727145347635587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12918061017990112, + "epoch": 3.95, + "learning_rate": 5.251128015977966e-07, + "loss": 0.1761, + "step": 10920, + "task_loss": 0.1778661012649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02303029193094031, + "compression/movement_sparsity/importance_threshold": -0.5716894508209539, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17667776346206665, + "epoch": 3.95, + "learning_rate": 5.234483425233145e-07, + "loss": 0.1753, + "step": 10930, + "task_loss": 0.5118983387947083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023105457051987223, + "compression/movement_sparsity/importance_threshold": -0.5706655907859617, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18898850679397583, + "epoch": 3.95, + "learning_rate": 5.217855898800249e-07, + "loss": 0.1735, + "step": 10940, + "task_loss": 0.4475729465484619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023180532375397078, + "compression/movement_sparsity/importance_threshold": -0.5696429539274979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1563543677330017, + "epoch": 3.96, + "learning_rate": 5.201245496218955e-07, + "loss": 0.1829, + "step": 10950, + "task_loss": 0.5748417377471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023255517954841286, + "compression/movement_sparsity/importance_threshold": -0.5686215395144786, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1615605354309082, + "epoch": 3.96, + "learning_rate": 5.18465227696761e-07, + "loss": 0.18, + "step": 10960, + "task_loss": 0.3879707455635071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02333041384399125, + "compression/movement_sparsity/importance_threshold": -0.56760134681582, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16784200072288513, + "epoch": 3.96, + "learning_rate": 5.168076300463044e-07, + "loss": 0.1823, + "step": 10970, + "task_loss": 0.72287917137146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02340522009651837, + "compression/movement_sparsity/importance_threshold": -0.5665823751004382, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17279008030891418, + "epoch": 3.97, + "learning_rate": 5.151517626060346e-07, + "loss": 0.1768, + "step": 10980, + "task_loss": 0.4392220973968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023479936766094087, + "compression/movement_sparsity/importance_threshold": -0.5655646236372489, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16514915227890015, + "epoch": 3.97, + "learning_rate": 5.134976313052633e-07, + "loss": 0.1753, + "step": 10990, + "task_loss": 0.7647157907485962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023554563906389804, + "compression/movement_sparsity/importance_threshold": -0.5645480916951683, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15675148367881775, + "epoch": 3.98, + "learning_rate": 5.118452420670876e-07, + "loss": 0.1832, + "step": 11000, + "task_loss": 0.43583187460899353 + }, + { + "epoch": 3.98, + "eval_exact_match": 83.6329233680227, + "eval_f1": 90.05063824812734, + "step": 11000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023629101571076915, + "compression/movement_sparsity/importance_threshold": -0.5635327785431127, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14564642310142517, + "epoch": 3.98, + "learning_rate": 5.101946008083647e-07, + "loss": 0.1726, + "step": 11010, + "task_loss": 0.4629361033439636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023703549813826847, + "compression/movement_sparsity/importance_threshold": -0.5625186834499979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13881969451904297, + "epoch": 3.98, + "learning_rate": 5.085457134396945e-07, + "loss": 0.1657, + "step": 11020, + "task_loss": 0.4461815357208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023777908688310995, + "compression/movement_sparsity/importance_threshold": -0.5615058056847402, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16499027609825134, + "epoch": 3.99, + "learning_rate": 5.068985858653947e-07, + "loss": 0.1757, + "step": 11030, + "task_loss": 0.237386554479599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02385217824820078, + "compression/movement_sparsity/importance_threshold": -0.5604941445162555, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14505073428153992, + "epoch": 3.99, + "learning_rate": 5.052532239834831e-07, + "loss": 0.1756, + "step": 11040, + "task_loss": 0.3913651406764984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02392635854716761, + "compression/movement_sparsity/importance_threshold": -0.55948369921346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16279101371765137, + "epoch": 3.99, + "learning_rate": 5.036096336856539e-07, + "loss": 0.1736, + "step": 11050, + "task_loss": 0.2173888087272644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0240004496388829, + "compression/movement_sparsity/importance_threshold": -0.5584744690452697, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19913247227668762, + "epoch": 4.0, + "learning_rate": 5.019678208572585e-07, + "loss": 0.1781, + "step": 11060, + "task_loss": 0.4576454758644104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024074451577018048, + "compression/movement_sparsity/importance_threshold": -0.5574664532806006, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1983916163444519, + "epoch": 4.0, + "learning_rate": 5.003277913772834e-07, + "loss": 0.1729, + "step": 11070, + "task_loss": 0.3344120383262634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024148364415244483, + "compression/movement_sparsity/importance_threshold": -0.5564596511883687, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13938963413238525, + "epoch": 4.0, + "learning_rate": 4.986895511183282e-07, + "loss": 0.1749, + "step": 11080, + "task_loss": 0.34267091751098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024222188207233597, + "compression/movement_sparsity/importance_threshold": -0.5554540620374904, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15878230333328247, + "epoch": 4.01, + "learning_rate": 4.970531059465865e-07, + "loss": 0.1706, + "step": 11090, + "task_loss": 0.41367635130882263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024295923006656818, + "compression/movement_sparsity/importance_threshold": -0.5544496850968814, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18763568997383118, + "epoch": 4.01, + "learning_rate": 4.954184617218251e-07, + "loss": 0.1847, + "step": 11100, + "task_loss": 0.38458868861198425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024369568867185543, + "compression/movement_sparsity/importance_threshold": -0.5534465196354581, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1669691503047943, + "epoch": 4.02, + "learning_rate": 4.937856242973598e-07, + "loss": 0.1704, + "step": 11110, + "task_loss": 0.40393808484077454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024443125842491187, + "compression/movement_sparsity/importance_threshold": -0.5524445649221362, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1612987220287323, + "epoch": 4.02, + "learning_rate": 4.921545995200387e-07, + "loss": 0.1749, + "step": 11120, + "task_loss": 0.3647257089614868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024516593986245158, + "compression/movement_sparsity/importance_threshold": -0.5514438202258322, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1850115805864334, + "epoch": 4.02, + "learning_rate": 4.905253932302173e-07, + "loss": 0.1773, + "step": 11130, + "task_loss": 0.6843971014022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02458997335211887, + "compression/movement_sparsity/importance_threshold": -0.5504442848154618, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19363956153392792, + "epoch": 4.03, + "learning_rate": 4.88898011261741e-07, + "loss": 0.1652, + "step": 11140, + "task_loss": 0.40217435359954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024663263993783737, + "compression/movement_sparsity/importance_threshold": -0.5494459579599411, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1595451831817627, + "epoch": 4.03, + "learning_rate": 4.872724594419225e-07, + "loss": 0.1616, + "step": 11150, + "task_loss": 0.29367250204086304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024736465964911164, + "compression/movement_sparsity/importance_threshold": -0.5484488389281864, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16744457185268402, + "epoch": 4.03, + "learning_rate": 4.8564874359152e-07, + "loss": 0.1756, + "step": 11160, + "task_loss": 0.3866707682609558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02480957931917255, + "compression/movement_sparsity/importance_threshold": -0.5474529269891137, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1353551745414734, + "epoch": 4.04, + "learning_rate": 4.840268695247185e-07, + "loss": 0.1662, + "step": 11170, + "task_loss": 0.4497717022895813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02488260411023933, + "compression/movement_sparsity/importance_threshold": -0.5464582214116389, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16634535789489746, + "epoch": 4.04, + "learning_rate": 4.824068430491079e-07, + "loss": 0.1605, + "step": 11180, + "task_loss": 0.4282223582267761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02495554039178291, + "compression/movement_sparsity/importance_threshold": -0.5454647214646782, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16130493581295013, + "epoch": 4.04, + "learning_rate": 4.807886699656621e-07, + "loss": 0.1793, + "step": 11190, + "task_loss": 0.6635265350341797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025028388217474686, + "compression/movement_sparsity/importance_threshold": -0.5444724264171475, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16918551921844482, + "epoch": 4.05, + "learning_rate": 4.791723560687181e-07, + "loss": 0.1759, + "step": 11200, + "task_loss": 0.30166515707969666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02510114764098608, + "compression/movement_sparsity/importance_threshold": -0.5434813355379631, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14573919773101807, + "epoch": 4.05, + "learning_rate": 4.775579071459558e-07, + "loss": 0.1725, + "step": 11210, + "task_loss": 0.3856244385242462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0251738187159885, + "compression/movement_sparsity/importance_threshold": -0.542491448096041, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14356495440006256, + "epoch": 4.05, + "learning_rate": 4.759453289783776e-07, + "loss": 0.1684, + "step": 11220, + "task_loss": 0.2242923080921173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025246401496153356, + "compression/movement_sparsity/importance_threshold": -0.5415027633602971, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18279266357421875, + "epoch": 4.06, + "learning_rate": 4.7433462734028563e-07, + "loss": 0.1697, + "step": 11230, + "task_loss": 0.33016282320022583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025318896035152062, + "compression/movement_sparsity/importance_threshold": -0.5405152805996476, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13953736424446106, + "epoch": 4.06, + "learning_rate": 4.727258079992643e-07, + "loss": 0.1646, + "step": 11240, + "task_loss": 0.4277157187461853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02539130238665602, + "compression/movement_sparsity/importance_threshold": -0.5395289990830087, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1368265450000763, + "epoch": 4.07, + "learning_rate": 4.7111887671615635e-07, + "loss": 0.1765, + "step": 11250, + "task_loss": 0.3417168855667114 + }, + { + "epoch": 4.07, + "eval_exact_match": 83.65184484389782, + "eval_f1": 90.03112152658635, + "step": 11250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025463620604336632, + "compression/movement_sparsity/importance_threshold": -0.5385439180792964, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13024994730949402, + "epoch": 4.07, + "learning_rate": 4.6951383924504486e-07, + "loss": 0.1765, + "step": 11260, + "task_loss": 0.21580921113491058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02553585074186534, + "compression/movement_sparsity/importance_threshold": -0.5375600368574266, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15584993362426758, + "epoch": 4.07, + "learning_rate": 4.679107013332316e-07, + "loss": 0.1693, + "step": 11270, + "task_loss": 0.18834367394447327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02560799285291353, + "compression/movement_sparsity/importance_threshold": -0.5365773546863154, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14209944009780884, + "epoch": 4.08, + "learning_rate": 4.6630946872121534e-07, + "loss": 0.1894, + "step": 11280, + "task_loss": 0.28304344415664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025680046991152623, + "compression/movement_sparsity/importance_threshold": -0.5355958708348789, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1544867753982544, + "epoch": 4.08, + "learning_rate": 4.6471014714267353e-07, + "loss": 0.1669, + "step": 11290, + "task_loss": 0.34882280230522156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025752013210254023, + "compression/movement_sparsity/importance_threshold": -0.5346155845720333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15384814143180847, + "epoch": 4.08, + "learning_rate": 4.6311274232443984e-07, + "loss": 0.1769, + "step": 11300, + "task_loss": 0.6888493299484253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02582389156388915, + "compression/movement_sparsity/importance_threshold": -0.5336364951666945, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14584749937057495, + "epoch": 4.09, + "learning_rate": 4.615172599864855e-07, + "loss": 0.1893, + "step": 11310, + "task_loss": 0.42334309220314026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0258956821057294, + "compression/movement_sparsity/importance_threshold": -0.5326586018877787, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17131412029266357, + "epoch": 4.09, + "learning_rate": 4.59923705841896e-07, + "loss": 0.1795, + "step": 11320, + "task_loss": 0.6292405128479004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0259673848894462, + "compression/movement_sparsity/importance_threshold": -0.5316819040042018, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1550251841545105, + "epoch": 4.09, + "learning_rate": 4.5833208559685377e-07, + "loss": 0.179, + "step": 11330, + "task_loss": 0.7897596955299377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026038999968710952, + "compression/movement_sparsity/importance_threshold": -0.5307064007848801, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1730719804763794, + "epoch": 4.1, + "learning_rate": 4.5674240495061643e-07, + "loss": 0.1749, + "step": 11340, + "task_loss": 0.46178561449050903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02611052739719506, + "compression/movement_sparsity/importance_threshold": -0.5297320914987294, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19655971229076385, + "epoch": 4.1, + "learning_rate": 4.5515466959549486e-07, + "loss": 0.1767, + "step": 11350, + "task_loss": 0.7543710470199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026181967228569945, + "compression/movement_sparsity/importance_threshold": -0.528758975414666, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12220363318920135, + "epoch": 4.11, + "learning_rate": 4.5356888521683613e-07, + "loss": 0.1517, + "step": 11360, + "task_loss": 0.3255242705345154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02625331951650702, + "compression/movement_sparsity/importance_threshold": -0.5277870518016057, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15358072519302368, + "epoch": 4.11, + "learning_rate": 4.519850574929996e-07, + "loss": 0.1836, + "step": 11370, + "task_loss": 0.28549590706825256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02632458431467768, + "compression/movement_sparsity/importance_threshold": -0.5268163199284649, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16602346301078796, + "epoch": 4.11, + "learning_rate": 4.504031920953394e-07, + "loss": 0.1814, + "step": 11380, + "task_loss": 0.48657315969467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02639576167675335, + "compression/movement_sparsity/importance_threshold": -0.5258467790641594, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12628331780433655, + "epoch": 4.12, + "learning_rate": 4.4882329468818246e-07, + "loss": 0.1836, + "step": 11390, + "task_loss": 0.2578519582748413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026466851656405437, + "compression/movement_sparsity/importance_threshold": -0.5248784284776054, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15595079958438873, + "epoch": 4.12, + "learning_rate": 4.472453709288091e-07, + "loss": 0.1813, + "step": 11400, + "task_loss": 0.5081670880317688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026537854307305344, + "compression/movement_sparsity/importance_threshold": -0.523911267437719, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16610749065876007, + "epoch": 4.12, + "learning_rate": 4.4566942646743246e-07, + "loss": 0.1716, + "step": 11410, + "task_loss": 0.43464046716690063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026608769683124495, + "compression/movement_sparsity/importance_threshold": -0.5229452952134162, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17217926681041718, + "epoch": 4.13, + "learning_rate": 4.4409546694717736e-07, + "loss": 0.1618, + "step": 11420, + "task_loss": 0.38427919149398804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026679597837534295, + "compression/movement_sparsity/importance_threshold": -0.5219805110736129, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14132100343704224, + "epoch": 4.13, + "learning_rate": 4.425234980040622e-07, + "loss": 0.1756, + "step": 11430, + "task_loss": 0.49500393867492676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026750338824206146, + "compression/movement_sparsity/importance_threshold": -0.5210169142872255, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.180766299366951, + "epoch": 4.13, + "learning_rate": 4.409535252669763e-07, + "loss": 0.1809, + "step": 11440, + "task_loss": 0.47744685411453247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026820992696811475, + "compression/movement_sparsity/importance_threshold": -0.5200545041231699, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16767911612987518, + "epoch": 4.14, + "learning_rate": 4.3938555435766187e-07, + "loss": 0.172, + "step": 11450, + "task_loss": 0.4917362332344055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026891559509021664, + "compression/movement_sparsity/importance_threshold": -0.5190932798503622, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16706699132919312, + "epoch": 4.14, + "learning_rate": 4.378195908906932e-07, + "loss": 0.1763, + "step": 11460, + "task_loss": 0.25731348991394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026962039314508152, + "compression/movement_sparsity/importance_threshold": -0.5181332407377185, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1750440001487732, + "epoch": 4.15, + "learning_rate": 4.362556404734552e-07, + "loss": 0.1642, + "step": 11470, + "task_loss": 0.6891375184059143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02703243216694235, + "compression/movement_sparsity/importance_threshold": -0.5171743860541547, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1234075054526329, + "epoch": 4.15, + "learning_rate": 4.346937087061259e-07, + "loss": 0.1645, + "step": 11480, + "task_loss": 0.4213874936103821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027102738119995654, + "compression/movement_sparsity/importance_threshold": -0.5162167150685868, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1771700531244278, + "epoch": 4.15, + "learning_rate": 4.3313380118165345e-07, + "loss": 0.1784, + "step": 11490, + "task_loss": 0.4768829345703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02717295722733948, + "compression/movement_sparsity/importance_threshold": -0.5152602270499311, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17060580849647522, + "epoch": 4.16, + "learning_rate": 4.315759234857388e-07, + "loss": 0.1732, + "step": 11500, + "task_loss": 0.7726600170135498 + }, + { + "epoch": 4.16, + "eval_exact_match": 83.6802270577105, + "eval_f1": 90.00678546584051, + "step": 11500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02724308954264524, + "compression/movement_sparsity/importance_threshold": -0.5143049212671038, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19148531556129456, + "epoch": 4.16, + "learning_rate": 4.300200811968141e-07, + "loss": 0.1736, + "step": 11510, + "task_loss": 0.3507334887981415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02731313511958434, + "compression/movement_sparsity/importance_threshold": -0.5133507969890206, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1210765391588211, + "epoch": 4.16, + "learning_rate": 4.284662798860232e-07, + "loss": 0.1682, + "step": 11520, + "task_loss": 0.20562471449375153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027383094011828195, + "compression/movement_sparsity/importance_threshold": -0.5123978534845979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14142274856567383, + "epoch": 4.17, + "learning_rate": 4.2691452511720194e-07, + "loss": 0.1691, + "step": 11530, + "task_loss": 0.3383503556251526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027452966273048215, + "compression/movement_sparsity/importance_threshold": -0.5114460900227514, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1510867178440094, + "epoch": 4.17, + "learning_rate": 4.253648224468567e-07, + "loss": 0.1818, + "step": 11540, + "task_loss": 0.7619709968566895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027522751956915795, + "compression/movement_sparsity/importance_threshold": -0.5104955058723977, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17000426352024078, + "epoch": 4.17, + "learning_rate": 4.238171774241471e-07, + "loss": 0.1604, + "step": 11550, + "task_loss": 0.3821756839752197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027592451117102377, + "compression/movement_sparsity/importance_threshold": -0.5095461003024523, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12718622386455536, + "epoch": 4.18, + "learning_rate": 4.2227159559086466e-07, + "loss": 0.1767, + "step": 11560, + "task_loss": 0.3057350814342499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02766206380727935, + "compression/movement_sparsity/importance_threshold": -0.5085978725818315, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13876132667064667, + "epoch": 4.18, + "learning_rate": 4.207280824814119e-07, + "loss": 0.1755, + "step": 11570, + "task_loss": 0.48780569434165955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027731590081118137, + "compression/movement_sparsity/importance_threshold": -0.5076508219794513, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15152356028556824, + "epoch": 4.19, + "learning_rate": 4.191866436227851e-07, + "loss": 0.1602, + "step": 11580, + "task_loss": 0.29727867245674133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027801029992290137, + "compression/movement_sparsity/importance_threshold": -0.506704947764228, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17890770733356476, + "epoch": 4.19, + "learning_rate": 4.1764728453455167e-07, + "loss": 0.1692, + "step": 11590, + "task_loss": 0.46190762519836426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027870383594466763, + "compression/movement_sparsity/importance_threshold": -0.5057602492050775, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16518770158290863, + "epoch": 4.19, + "learning_rate": 4.1611001072883323e-07, + "loss": 0.1793, + "step": 11600, + "task_loss": 0.6467550992965698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02793965094131943, + "compression/movement_sparsity/importance_threshold": -0.5048167255709157, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12185207009315491, + "epoch": 4.2, + "learning_rate": 4.1457482771028305e-07, + "loss": 0.1653, + "step": 11610, + "task_loss": 0.3818345069885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02800883208651955, + "compression/movement_sparsity/importance_threshold": -0.5038743761306589, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1789764165878296, + "epoch": 4.2, + "learning_rate": 4.130417409760689e-07, + "loss": 0.1788, + "step": 11620, + "task_loss": 0.6418240666389465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02807792708373852, + "compression/movement_sparsity/importance_threshold": -0.5029332001532232, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1454584151506424, + "epoch": 4.2, + "learning_rate": 4.1151075601585174e-07, + "loss": 0.1828, + "step": 11630, + "task_loss": 0.43138134479522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02814693598664776, + "compression/movement_sparsity/importance_threshold": -0.5019931969075245, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16067031025886536, + "epoch": 4.21, + "learning_rate": 4.0998187831176636e-07, + "loss": 0.1775, + "step": 11640, + "task_loss": 0.32413339614868164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02821585884891869, + "compression/movement_sparsity/importance_threshold": -0.5010543656624789, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17555850744247437, + "epoch": 4.21, + "learning_rate": 4.084551133384024e-07, + "loss": 0.1691, + "step": 11650, + "task_loss": 0.5207797884941101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028284695724222705, + "compression/movement_sparsity/importance_threshold": -0.5001167056870026, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12386974692344666, + "epoch": 4.21, + "learning_rate": 4.069304665627834e-07, + "loss": 0.1682, + "step": 11660, + "task_loss": 0.26369139552116394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028353446666231224, + "compression/movement_sparsity/importance_threshold": -0.4991802162500115, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15394540131092072, + "epoch": 4.22, + "learning_rate": 4.054079434443487e-07, + "loss": 0.1543, + "step": 11670, + "task_loss": 0.4621257781982422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028422111728615654, + "compression/movement_sparsity/importance_threshold": -0.4982448966204218, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15610939264297485, + "epoch": 4.22, + "learning_rate": 4.0388754943493374e-07, + "loss": 0.174, + "step": 11680, + "task_loss": 0.7012438774108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02849069096504741, + "compression/movement_sparsity/importance_threshold": -0.4973107460671495, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1879960000514984, + "epoch": 4.22, + "learning_rate": 4.023692899787486e-07, + "loss": 0.181, + "step": 11690, + "task_loss": 0.5456829071044922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028559184429197897, + "compression/movement_sparsity/importance_threshold": -0.49637776385911064, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16950491070747375, + "epoch": 4.23, + "learning_rate": 4.0085317051236176e-07, + "loss": 0.1782, + "step": 11700, + "task_loss": 0.5510709881782532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028627592174738523, + "compression/movement_sparsity/importance_threshold": -0.4954459492652214, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14448924362659454, + "epoch": 4.23, + "learning_rate": 3.9933919646467716e-07, + "loss": 0.1733, + "step": 11710, + "task_loss": 0.4822184443473816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028695914255340713, + "compression/movement_sparsity/importance_threshold": -0.49451530155439766, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1638861894607544, + "epoch": 4.24, + "learning_rate": 3.9782737325691786e-07, + "loss": 0.1785, + "step": 11720, + "task_loss": 0.3740285038948059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02876415072467586, + "compression/movement_sparsity/importance_threshold": -0.49358581999555573, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12357451766729355, + "epoch": 4.24, + "learning_rate": 3.96317706302604e-07, + "loss": 0.1757, + "step": 11730, + "task_loss": 0.7518565654754639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028832301636415394, + "compression/movement_sparsity/importance_threshold": -0.49265750385761137, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11722180247306824, + "epoch": 4.24, + "learning_rate": 3.948102010075356e-07, + "loss": 0.1569, + "step": 11740, + "task_loss": 0.4631866216659546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02890036704423069, + "compression/movement_sparsity/importance_threshold": -0.49173035240948115, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.23655325174331665, + "epoch": 4.25, + "learning_rate": 3.933048627697717e-07, + "loss": 0.1875, + "step": 11750, + "task_loss": 0.260358989238739 + }, + { + "epoch": 4.25, + "eval_exact_match": 83.80321665089878, + "eval_f1": 90.13903938703879, + "step": 11750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028968347001793203, + "compression/movement_sparsity/importance_threshold": -0.49080436492008056, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21262499690055847, + "epoch": 4.25, + "learning_rate": 3.9180169697961183e-07, + "loss": 0.1815, + "step": 11760, + "task_loss": 0.3771222233772278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02903624156277432, + "compression/movement_sparsity/importance_threshold": -0.4898795406583259, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14260724186897278, + "epoch": 4.25, + "learning_rate": 3.903007090195768e-07, + "loss": 0.1721, + "step": 11770, + "task_loss": 0.404729425907135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029104050780845457, + "compression/movement_sparsity/importance_threshold": -0.4889558788931333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18275782465934753, + "epoch": 4.26, + "learning_rate": 3.8880190426438764e-07, + "loss": 0.1657, + "step": 11780, + "task_loss": 0.2863193154335022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029171774709678022, + "compression/movement_sparsity/importance_threshold": -0.4880333788934188, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16939735412597656, + "epoch": 4.26, + "learning_rate": 3.873052880809493e-07, + "loss": 0.1796, + "step": 11790, + "task_loss": 0.33054405450820923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02923941340294343, + "compression/movement_sparsity/importance_threshold": -0.4871120399280983, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18896028399467468, + "epoch": 4.26, + "learning_rate": 3.8581086582832967e-07, + "loss": 0.1792, + "step": 11800, + "task_loss": 0.5677767992019653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029306966914313084, + "compression/movement_sparsity/importance_threshold": -0.48619186126608815, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1795285940170288, + "epoch": 4.27, + "learning_rate": 3.8431864285773964e-07, + "loss": 0.1712, + "step": 11810, + "task_loss": 0.7608554363250732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02937443529745841, + "compression/movement_sparsity/importance_threshold": -0.4852728421763042, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1296456754207611, + "epoch": 4.27, + "learning_rate": 3.828286245125163e-07, + "loss": 0.1655, + "step": 11820, + "task_loss": 0.385231614112854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029441818606050794, + "compression/movement_sparsity/importance_threshold": -0.48435498192766263, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16423696279525757, + "epoch": 4.28, + "learning_rate": 3.8134081612810097e-07, + "loss": 0.167, + "step": 11830, + "task_loss": 0.2643640637397766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02950911689376165, + "compression/movement_sparsity/importance_threshold": -0.48343827978907966, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1867581307888031, + "epoch": 4.28, + "learning_rate": 3.7985522303202277e-07, + "loss": 0.1719, + "step": 11840, + "task_loss": 0.4425698518753052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02957633021426241, + "compression/movement_sparsity/importance_threshold": -0.482522735029471, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13380573689937592, + "epoch": 4.28, + "learning_rate": 3.7837185054387833e-07, + "loss": 0.1659, + "step": 11850, + "task_loss": 0.28523489832878113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02964345862122448, + "compression/movement_sparsity/importance_threshold": -0.48160834691775284, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1440255045890808, + "epoch": 4.29, + "learning_rate": 3.7689070397531163e-07, + "loss": 0.1602, + "step": 11860, + "task_loss": 0.47079092264175415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029710502168319258, + "compression/movement_sparsity/importance_threshold": -0.4806951147228413, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17543549835681915, + "epoch": 4.29, + "learning_rate": 3.7541178862999714e-07, + "loss": 0.1781, + "step": 11870, + "task_loss": 0.6562625169754028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029777460909218165, + "compression/movement_sparsity/importance_threshold": -0.47978303771365244, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13487908244132996, + "epoch": 4.29, + "learning_rate": 3.739351098036195e-07, + "loss": 0.1778, + "step": 11880, + "task_loss": 0.23533663153648376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0298443348975926, + "compression/movement_sparsity/importance_threshold": -0.4788721151591024, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15928372740745544, + "epoch": 4.3, + "learning_rate": 3.724606727838551e-07, + "loss": 0.1712, + "step": 11890, + "task_loss": 0.4500586986541748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029911124187113988, + "compression/movement_sparsity/importance_threshold": -0.477962346328107, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16884973645210266, + "epoch": 4.3, + "learning_rate": 3.70988482850352e-07, + "loss": 0.1865, + "step": 11900, + "task_loss": 0.22457614541053772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029977828831453723, + "compression/movement_sparsity/importance_threshold": -0.4770537304895827, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1822618544101715, + "epoch": 4.3, + "learning_rate": 3.695185452747127e-07, + "loss": 0.1776, + "step": 11910, + "task_loss": 0.32499706745147705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030044448884283235, + "compression/movement_sparsity/importance_threshold": -0.4761462669124452, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15013720095157623, + "epoch": 4.31, + "learning_rate": 3.680508653204748e-07, + "loss": 0.1711, + "step": 11920, + "task_loss": 0.37257882952690125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03011098439927392, + "compression/movement_sparsity/importance_threshold": -0.47523995486561077, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13815215229988098, + "epoch": 4.31, + "learning_rate": 3.665854482430907e-07, + "loss": 0.1778, + "step": 11930, + "task_loss": 0.34098368883132935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030177435430097186, + "compression/movement_sparsity/importance_threshold": -0.47433479361799546, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13309718668460846, + "epoch": 4.32, + "learning_rate": 3.65122299289911e-07, + "loss": 0.162, + "step": 11940, + "task_loss": 0.17212031781673431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03024380203042446, + "compression/movement_sparsity/importance_threshold": -0.4734307824385152, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17650893330574036, + "epoch": 4.32, + "learning_rate": 3.636614237001637e-07, + "loss": 0.1833, + "step": 11950, + "task_loss": 0.4818665087223053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03031008425392714, + "compression/movement_sparsity/importance_threshold": -0.47252792059608617, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1562044322490692, + "epoch": 4.32, + "learning_rate": 3.6220282670493706e-07, + "loss": 0.1765, + "step": 11960, + "task_loss": 0.4606776535511017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03037628215427664, + "compression/movement_sparsity/importance_threshold": -0.47162620735962446, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16945087909698486, + "epoch": 4.33, + "learning_rate": 3.607465135271603e-07, + "loss": 0.1905, + "step": 11970, + "task_loss": 0.40968573093414307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03044239578514437, + "compression/movement_sparsity/importance_threshold": -0.4707256419980461, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17481261491775513, + "epoch": 4.33, + "learning_rate": 3.5929248938158396e-07, + "loss": 0.1733, + "step": 11980, + "task_loss": 0.25828778743743896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030508425200201736, + "compression/movement_sparsity/importance_threshold": -0.46982622378026717, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21180686354637146, + "epoch": 4.33, + "learning_rate": 3.578407594747624e-07, + "loss": 0.1729, + "step": 11990, + "task_loss": 0.4141823351383209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03057437045312016, + "compression/movement_sparsity/importance_threshold": -0.46892795197520365, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19437111914157867, + "epoch": 4.34, + "learning_rate": 3.5639132900503533e-07, + "loss": 0.1742, + "step": 12000, + "task_loss": 0.39397311210632324 + }, + { + "epoch": 4.34, + "eval_exact_match": 83.56669820245979, + "eval_f1": 90.00705474053464, + "step": 12000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030640231597571045, + "compression/movement_sparsity/importance_threshold": -0.4680308258517717, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15808720886707306, + "epoch": 4.34, + "learning_rate": 3.549442031625084e-07, + "loss": 0.1823, + "step": 12010, + "task_loss": 0.31429338455200195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030706008687225797, + "compression/movement_sparsity/importance_threshold": -0.4671348446788875, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13028618693351746, + "epoch": 4.34, + "learning_rate": 3.534993871290338e-07, + "loss": 0.1702, + "step": 12020, + "task_loss": 0.35157257318496704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030771701775755823, + "compression/movement_sparsity/importance_threshold": -0.46624000772546703, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1625824123620987, + "epoch": 4.35, + "learning_rate": 3.520568860781944e-07, + "loss": 0.1868, + "step": 12030, + "task_loss": 0.38068217039108276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030837310916832554, + "compression/movement_sparsity/importance_threshold": -0.46534631426042616, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1509367823600769, + "epoch": 4.35, + "learning_rate": 3.5061670517528294e-07, + "loss": 0.1763, + "step": 12040, + "task_loss": 0.3321918249130249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030902836164127395, + "compression/movement_sparsity/importance_threshold": -0.46445376355268103, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16750577092170715, + "epoch": 4.35, + "learning_rate": 3.491788495772836e-07, + "loss": 0.1792, + "step": 12050, + "task_loss": 0.8832313418388367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03096827757131175, + "compression/movement_sparsity/importance_threshold": -0.46356235487114783, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15269014239311218, + "epoch": 4.36, + "learning_rate": 3.477433244328553e-07, + "loss": 0.1686, + "step": 12060, + "task_loss": 0.37979966402053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031033635192057027, + "compression/movement_sparsity/importance_threshold": -0.46267208748474264, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16275674104690552, + "epoch": 4.36, + "learning_rate": 3.4631013488231075e-07, + "loss": 0.1806, + "step": 12070, + "task_loss": 0.38607725501060486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03109890908003464, + "compression/movement_sparsity/importance_threshold": -0.46178296066238145, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.24525508284568787, + "epoch": 4.37, + "learning_rate": 3.448792860576004e-07, + "loss": 0.1838, + "step": 12080, + "task_loss": 0.5830104947090149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031164099288916003, + "compression/movement_sparsity/importance_threshold": -0.4608949736729803, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15277394652366638, + "epoch": 4.37, + "learning_rate": 3.434507830822934e-07, + "loss": 0.1763, + "step": 12090, + "task_loss": 0.2695538401603699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031229205872372517, + "compression/movement_sparsity/importance_threshold": -0.46000812578545536, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16866612434387207, + "epoch": 4.37, + "learning_rate": 3.420246310715572e-07, + "loss": 0.1634, + "step": 12100, + "task_loss": 0.5227435827255249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03129422888407559, + "compression/movement_sparsity/importance_threshold": -0.4591224162687228, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18312391638755798, + "epoch": 4.38, + "learning_rate": 3.4060083513214257e-07, + "loss": 0.1721, + "step": 12110, + "task_loss": 0.5585439801216125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03135916837769666, + "compression/movement_sparsity/importance_threshold": -0.4582378443916983, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15855933725833893, + "epoch": 4.38, + "learning_rate": 3.39179400362363e-07, + "loss": 0.1687, + "step": 12120, + "task_loss": 0.47586819529533386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03142402440690711, + "compression/movement_sparsity/importance_threshold": -0.45735440942329825, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17082326114177704, + "epoch": 4.38, + "learning_rate": 3.377603318520779e-07, + "loss": 0.1633, + "step": 12130, + "task_loss": 0.49885010719299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03148879702537836, + "compression/movement_sparsity/importance_threshold": -0.45647211063243853, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1388573944568634, + "epoch": 4.39, + "learning_rate": 3.3634363468267177e-07, + "loss": 0.1781, + "step": 12140, + "task_loss": 0.5175285339355469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031553486286781826, + "compression/movement_sparsity/importance_threshold": -0.4555909472880353, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1499364674091339, + "epoch": 4.39, + "learning_rate": 3.349293139270398e-07, + "loss": 0.1753, + "step": 12150, + "task_loss": 0.13086864352226257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03161809224478891, + "compression/movement_sparsity/importance_threshold": -0.45471091865900465, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2038741409778595, + "epoch": 4.39, + "learning_rate": 3.335173746495672e-07, + "loss": 0.1774, + "step": 12160, + "task_loss": 0.616294801235199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03168261495307103, + "compression/movement_sparsity/importance_threshold": -0.4538320240142626, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16956669092178345, + "epoch": 4.4, + "learning_rate": 3.3210782190611054e-07, + "loss": 0.1763, + "step": 12170, + "task_loss": 0.5146920680999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03174705446529958, + "compression/movement_sparsity/importance_threshold": -0.45295426262272526, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1898520290851593, + "epoch": 4.4, + "learning_rate": 3.3070066074398226e-07, + "loss": 0.1798, + "step": 12180, + "task_loss": 0.5947543382644653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03181141083514599, + "compression/movement_sparsity/importance_threshold": -0.45207763375330867, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13102659583091736, + "epoch": 4.41, + "learning_rate": 3.2929589620192975e-07, + "loss": 0.1581, + "step": 12190, + "task_loss": 0.2658257484436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.031875684116281666, + "compression/movement_sparsity/importance_threshold": -0.45120213667492887, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18428611755371094, + "epoch": 4.41, + "learning_rate": 3.278935333101196e-07, + "loss": 0.1835, + "step": 12200, + "task_loss": 0.45930129289627075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03193987436237801, + "compression/movement_sparsity/importance_threshold": -0.45032777065650204, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15300177037715912, + "epoch": 4.41, + "learning_rate": 3.264935770901183e-07, + "loss": 0.183, + "step": 12210, + "task_loss": 0.296694278717041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.032003981627106444, + "compression/movement_sparsity/importance_threshold": -0.4494545349669441, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17390677332878113, + "epoch": 4.42, + "learning_rate": 3.2509603255487394e-07, + "loss": 0.178, + "step": 12220, + "task_loss": 0.5502451062202454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03206800596413837, + "compression/movement_sparsity/importance_threshold": -0.44858242887517114, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18281325697898865, + "epoch": 4.42, + "learning_rate": 3.237009047086997e-07, + "loss": 0.169, + "step": 12230, + "task_loss": 0.4404161274433136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0321319474271452, + "compression/movement_sparsity/importance_threshold": -0.4477114516500993, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1602400243282318, + "epoch": 4.42, + "learning_rate": 3.2230819854725465e-07, + "loss": 0.1723, + "step": 12240, + "task_loss": 0.5139293670654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03219580606979835, + "compression/movement_sparsity/importance_threshold": -0.44684160256064465, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15721780061721802, + "epoch": 4.43, + "learning_rate": 3.2091791905752673e-07, + "loss": 0.1617, + "step": 12250, + "task_loss": 0.4229050874710083 + }, + { + "epoch": 4.43, + "eval_exact_match": 83.62346263008514, + "eval_f1": 89.97990512757178, + "step": 12250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03225958194576923, + "compression/movement_sparsity/importance_threshold": -0.4459728808757232, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15479019284248352, + "epoch": 4.43, + "learning_rate": 3.1953007121781425e-07, + "loss": 0.1746, + "step": 12260, + "task_loss": 0.5893458127975464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03232327510872924, + "compression/movement_sparsity/importance_threshold": -0.445105285864251, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15340088307857513, + "epoch": 4.43, + "learning_rate": 3.181446599977078e-07, + "loss": 0.1625, + "step": 12270, + "task_loss": 0.6467044353485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0323868856123498, + "compression/movement_sparsity/importance_threshold": -0.4442388167951442, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1622926890850067, + "epoch": 4.44, + "learning_rate": 3.167616903580738e-07, + "loss": 0.1754, + "step": 12280, + "task_loss": 0.4219571352005005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03245041351030232, + "compression/movement_sparsity/importance_threshold": -0.4433734729373189, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16757610440254211, + "epoch": 4.44, + "learning_rate": 3.1538116725103506e-07, + "loss": 0.1793, + "step": 12290, + "task_loss": 0.3955652713775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0325138588562582, + "compression/movement_sparsity/importance_threshold": -0.442509253559691, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19136472046375275, + "epoch": 4.45, + "learning_rate": 3.1400309561995473e-07, + "loss": 0.1777, + "step": 12300, + "task_loss": 0.5077202320098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.032577221703888874, + "compression/movement_sparsity/importance_threshold": -0.4416461579311766, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17287388443946838, + "epoch": 4.45, + "learning_rate": 3.126274803994169e-07, + "loss": 0.1769, + "step": 12310, + "task_loss": 0.2892530560493469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03264050210686572, + "compression/movement_sparsity/importance_threshold": -0.440784185320692, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1708020567893982, + "epoch": 4.45, + "learning_rate": 3.1125432651521034e-07, + "loss": 0.178, + "step": 12320, + "task_loss": 0.5046877264976501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03270370011886018, + "compression/movement_sparsity/importance_threshold": -0.439923334997153, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17857205867767334, + "epoch": 4.46, + "learning_rate": 3.098836388843105e-07, + "loss": 0.1756, + "step": 12330, + "task_loss": 0.47654569149017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03276681579354365, + "compression/movement_sparsity/importance_threshold": -0.43906360622947577, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15070676803588867, + "epoch": 4.46, + "learning_rate": 3.085154224148605e-07, + "loss": 0.1623, + "step": 12340, + "task_loss": 0.6382228136062622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03282984918458754, + "compression/movement_sparsity/importance_threshold": -0.43820499828657633, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14683285355567932, + "epoch": 4.46, + "learning_rate": 3.071496820061561e-07, + "loss": 0.1704, + "step": 12350, + "task_loss": 0.3013722598552704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03289280034566327, + "compression/movement_sparsity/importance_threshold": -0.43734751043737075, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11898720264434814, + "epoch": 4.47, + "learning_rate": 3.057864225486262e-07, + "loss": 0.1761, + "step": 12360, + "task_loss": 0.18860679864883423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.032955669330442236, + "compression/movement_sparsity/importance_threshold": -0.4364911419507752, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17226265370845795, + "epoch": 4.47, + "learning_rate": 3.044256489238159e-07, + "loss": 0.1697, + "step": 12370, + "task_loss": 0.2961030602455139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03301845619259586, + "compression/movement_sparsity/importance_threshold": -0.43563589209570563, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1485224962234497, + "epoch": 4.47, + "learning_rate": 3.030673660043698e-07, + "loss": 0.1871, + "step": 12380, + "task_loss": 0.31120193004608154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03308116098579554, + "compression/movement_sparsity/importance_threshold": -0.43478176014107817, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1510060578584671, + "epoch": 4.48, + "learning_rate": 3.018470449500072e-07, + "loss": 0.178, + "step": 12390, + "task_loss": 0.49028700590133667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0331437837637127, + "compression/movement_sparsity/importance_threshold": -0.433928745355809, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16419251263141632, + "epoch": 4.48, + "learning_rate": 3.004935077629299e-07, + "loss": 0.1782, + "step": 12400, + "task_loss": 0.3531630039215088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033206324580018744, + "compression/movement_sparsity/importance_threshold": -0.43307684700881394, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18890751898288727, + "epoch": 4.49, + "learning_rate": 2.991424753613858e-07, + "loss": 0.1893, + "step": 12410, + "task_loss": 0.4802757799625397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033268783488385086, + "compression/movement_sparsity/importance_threshold": -0.4322260643690091, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14421889185905457, + "epoch": 4.49, + "learning_rate": 2.97793952583138e-07, + "loss": 0.163, + "step": 12420, + "task_loss": 0.6870753765106201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03333116054248314, + "compression/movement_sparsity/importance_threshold": -0.4313763967053107, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14144985377788544, + "epoch": 4.49, + "learning_rate": 2.9644794425696316e-07, + "loss": 0.166, + "step": 12430, + "task_loss": 0.2926397919654846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0333934557959843, + "compression/movement_sparsity/importance_threshold": -0.43052784328663474, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14997738599777222, + "epoch": 4.5, + "learning_rate": 2.9510445520263315e-07, + "loss": 0.1834, + "step": 12440, + "task_loss": 0.2823546528816223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033455669302560005, + "compression/movement_sparsity/importance_threshold": -0.4296804033818971, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1645713895559311, + "epoch": 4.5, + "learning_rate": 2.937634902309001e-07, + "loss": 0.1829, + "step": 12450, + "task_loss": 0.41032591462135315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03351780111588164, + "compression/movement_sparsity/importance_threshold": -0.42883407626001413, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14776745438575745, + "epoch": 4.5, + "learning_rate": 2.9242505414347683e-07, + "loss": 0.1743, + "step": 12460, + "task_loss": 0.6065025925636292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03357985128962062, + "compression/movement_sparsity/importance_threshold": -0.4279888611899018, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17194849252700806, + "epoch": 4.51, + "learning_rate": 2.910891517330215e-07, + "loss": 0.1771, + "step": 12470, + "task_loss": 0.49688899517059326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03364181987744837, + "compression/movement_sparsity/importance_threshold": -0.42714475744047614, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17045895755290985, + "epoch": 4.51, + "learning_rate": 2.897557877831196e-07, + "loss": 0.1693, + "step": 12480, + "task_loss": 0.3933444917201996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03370370693303629, + "compression/movement_sparsity/importance_threshold": -0.42630176428065325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1370893120765686, + "epoch": 4.51, + "learning_rate": 2.884249670682668e-07, + "loss": 0.1849, + "step": 12490, + "task_loss": 0.28957247734069824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03376551251005577, + "compression/movement_sparsity/importance_threshold": -0.4254598809793493, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16283123195171356, + "epoch": 4.52, + "learning_rate": 2.870966943538522e-07, + "loss": 0.167, + "step": 12500, + "task_loss": 0.2896598279476166 + }, + { + "epoch": 4.52, + "eval_exact_match": 83.43424787133397, + "eval_f1": 89.91171539061436, + "step": 12500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03382723666217826, + "compression/movement_sparsity/importance_threshold": -0.42461910680548015, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13904690742492676, + "epoch": 4.52, + "learning_rate": 2.857709743961404e-07, + "loss": 0.1662, + "step": 12510, + "task_loss": 0.2602207660675049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03388887944307515, + "compression/movement_sparsity/importance_threshold": -0.4237794410279619, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15818578004837036, + "epoch": 4.52, + "learning_rate": 2.84447811942256e-07, + "loss": 0.1918, + "step": 12520, + "task_loss": 0.31633198261260986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03395044090641785, + "compression/movement_sparsity/importance_threshold": -0.42294088291571075, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17294219136238098, + "epoch": 4.53, + "learning_rate": 2.8312721173016476e-07, + "loss": 0.1681, + "step": 12530, + "task_loss": 0.3709946870803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03401192110587777, + "compression/movement_sparsity/importance_threshold": -0.4221034317376427, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12054181098937988, + "epoch": 4.53, + "learning_rate": 2.818091784886585e-07, + "loss": 0.168, + "step": 12540, + "task_loss": 0.2598145604133606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034073320095126335, + "compression/movement_sparsity/importance_threshold": -0.42126708676267377, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15481901168823242, + "epoch": 4.54, + "learning_rate": 2.80493716937337e-07, + "loss": 0.1767, + "step": 12550, + "task_loss": 0.38484764099121094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034134637927834936, + "compression/movement_sparsity/importance_threshold": -0.42043184725972005, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1437249779701233, + "epoch": 4.54, + "learning_rate": 2.791808317865907e-07, + "loss": 0.1773, + "step": 12560, + "task_loss": 0.3040648102760315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03419587465767499, + "compression/movement_sparsity/importance_threshold": -0.41959771249769773, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18909630179405212, + "epoch": 4.54, + "learning_rate": 2.778705277375857e-07, + "loss": 0.1829, + "step": 12570, + "task_loss": 0.3867703080177307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03425703033831792, + "compression/movement_sparsity/importance_threshold": -0.41876468174552267, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1678750216960907, + "epoch": 4.55, + "learning_rate": 2.765628094822443e-07, + "loss": 0.1736, + "step": 12580, + "task_loss": 0.4944628179073334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034318105023435116, + "compression/movement_sparsity/importance_threshold": -0.41793275427211113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16761477291584015, + "epoch": 4.55, + "learning_rate": 2.7525768170323084e-07, + "loss": 0.1873, + "step": 12590, + "task_loss": 0.2373383790254593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03437909876669801, + "compression/movement_sparsity/importance_threshold": -0.4171019293463789, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15242645144462585, + "epoch": 4.55, + "learning_rate": 2.7395514907393304e-07, + "loss": 0.1712, + "step": 12600, + "task_loss": 0.2393788993358612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034440011621777976, + "compression/movement_sparsity/importance_threshold": -0.4162722062372427, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19179469347000122, + "epoch": 4.56, + "learning_rate": 2.7265521625844623e-07, + "loss": 0.1749, + "step": 12610, + "task_loss": 0.3033488392829895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03450084364234648, + "compression/movement_sparsity/importance_threshold": -0.41544358421361766, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17567074298858643, + "epoch": 4.56, + "learning_rate": 2.7135788791155645e-07, + "loss": 0.1815, + "step": 12620, + "task_loss": 0.608439564704895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034561594882074886, + "compression/movement_sparsity/importance_threshold": -0.41461606254442046, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13859786093235016, + "epoch": 4.56, + "learning_rate": 2.7006316867872303e-07, + "loss": 0.1801, + "step": 12630, + "task_loss": 0.2615768313407898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.034622265394634616, + "compression/movement_sparsity/importance_threshold": -0.41378964049856715, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1449252963066101, + "epoch": 4.57, + "learning_rate": 2.6877106319606344e-07, + "loss": 0.1744, + "step": 12640, + "task_loss": 0.35204145312309265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0346828552336971, + "compression/movement_sparsity/importance_threshold": -0.41296431734497346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17655514180660248, + "epoch": 4.57, + "learning_rate": 2.6748157609033507e-07, + "loss": 0.1744, + "step": 12650, + "task_loss": 0.3563224971294403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03474336445293374, + "compression/movement_sparsity/importance_threshold": -0.4121400923525557, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19780133664608002, + "epoch": 4.58, + "learning_rate": 2.661947119789202e-07, + "loss": 0.1878, + "step": 12660, + "task_loss": 0.5263267755508423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03480379310601593, + "compression/movement_sparsity/importance_threshold": -0.41131696479023, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1600715070962906, + "epoch": 4.58, + "learning_rate": 2.649104754698085e-07, + "loss": 0.1757, + "step": 12670, + "task_loss": 0.3594798445701599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03486414124661509, + "compression/movement_sparsity/importance_threshold": -0.4104949339269123, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18169142305850983, + "epoch": 4.58, + "learning_rate": 2.636288711615801e-07, + "loss": 0.1801, + "step": 12680, + "task_loss": 0.2691894769668579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03492440892840263, + "compression/movement_sparsity/importance_threshold": -0.4096739990315188, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13344717025756836, + "epoch": 4.59, + "learning_rate": 2.623499036433909e-07, + "loss": 0.1725, + "step": 12690, + "task_loss": 0.42940986156463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03498459620504996, + "compression/movement_sparsity/importance_threshold": -0.4088541593729654, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1338200718164444, + "epoch": 4.59, + "learning_rate": 2.6107357749495396e-07, + "loss": 0.1853, + "step": 12700, + "task_loss": 0.0940115749835968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035044703130228505, + "compression/movement_sparsity/importance_threshold": -0.4080354142201682, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17829687893390656, + "epoch": 4.59, + "learning_rate": 2.5979989728652486e-07, + "loss": 0.1591, + "step": 12710, + "task_loss": 0.4869588017463684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03510472975760966, + "compression/movement_sparsity/importance_threshold": -0.4072177628420433, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19079387187957764, + "epoch": 4.6, + "learning_rate": 2.5852886757888417e-07, + "loss": 0.1801, + "step": 12720, + "task_loss": 0.3848039507865906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035164676140864846, + "compression/movement_sparsity/importance_threshold": -0.40640120450750666, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16148385405540466, + "epoch": 4.6, + "learning_rate": 2.57260492923322e-07, + "loss": 0.1754, + "step": 12730, + "task_loss": 0.41788214445114136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035224542333665454, + "compression/movement_sparsity/importance_threshold": -0.40558573848547463, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18705248832702637, + "epoch": 4.6, + "learning_rate": 2.5599477786162115e-07, + "loss": 0.1689, + "step": 12740, + "task_loss": 0.4458879232406616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035284328389682915, + "compression/movement_sparsity/importance_threshold": -0.404771364044863, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17292597889900208, + "epoch": 4.61, + "learning_rate": 2.547317269260405e-07, + "loss": 0.1806, + "step": 12750, + "task_loss": 0.3828756511211395 + }, + { + "epoch": 4.61, + "eval_exact_match": 83.59508041627247, + "eval_f1": 89.98942260118159, + "step": 12750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03534403436258863, + "compression/movement_sparsity/importance_threshold": -0.403958080454588, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12280981242656708, + "epoch": 4.61, + "learning_rate": 2.534713446393002e-07, + "loss": 0.1769, + "step": 12760, + "task_loss": 0.34313008189201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035403660306054026, + "compression/movement_sparsity/importance_threshold": -0.40314588698356546, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1836218684911728, + "epoch": 4.62, + "learning_rate": 2.522136355145632e-07, + "loss": 0.1796, + "step": 12770, + "task_loss": 0.5304206013679504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035463206273750485, + "compression/movement_sparsity/importance_threshold": -0.40233478290071184, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1639278084039688, + "epoch": 4.62, + "learning_rate": 2.5095860405542167e-07, + "loss": 0.1805, + "step": 12780, + "task_loss": 0.4174896776676178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03552267231934944, + "compression/movement_sparsity/importance_threshold": -0.4015247674749428, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1624828726053238, + "epoch": 4.62, + "learning_rate": 2.497062547558793e-07, + "loss": 0.1804, + "step": 12790, + "task_loss": 0.31818675994873047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03558205849652229, + "compression/movement_sparsity/importance_threshold": -0.40071583997517474, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16943714022636414, + "epoch": 4.63, + "learning_rate": 2.4845659210033477e-07, + "loss": 0.1765, + "step": 12800, + "task_loss": 0.44528499245643616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03564136485894044, + "compression/movement_sparsity/importance_threshold": -0.39990799967032353, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17110618948936462, + "epoch": 4.63, + "learning_rate": 2.4720962056356776e-07, + "loss": 0.177, + "step": 12810, + "task_loss": 0.3735952079296112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03570059146027532, + "compression/movement_sparsity/importance_threshold": -0.39910124582930534, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14698469638824463, + "epoch": 4.63, + "learning_rate": 2.4596534461072025e-07, + "loss": 0.1644, + "step": 12820, + "task_loss": 0.2619709074497223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03575973835419833, + "compression/movement_sparsity/importance_threshold": -0.3982955777210361, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14263629913330078, + "epoch": 4.64, + "learning_rate": 2.4472376869728286e-07, + "loss": 0.1668, + "step": 12830, + "task_loss": 0.20229429006576538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03581880559438087, + "compression/movement_sparsity/importance_threshold": -0.39749099461443205, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1636548638343811, + "epoch": 4.64, + "learning_rate": 2.4348489726907773e-07, + "loss": 0.1836, + "step": 12840, + "task_loss": 0.6526674628257751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03587779323449438, + "compression/movement_sparsity/importance_threshold": -0.396687495778409, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17002159357070923, + "epoch": 4.64, + "learning_rate": 2.422487347622425e-07, + "loss": 0.1933, + "step": 12850, + "task_loss": 0.3556768298149109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035936701328210245, + "compression/movement_sparsity/importance_threshold": -0.3958850804818833, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13383722305297852, + "epoch": 4.65, + "learning_rate": 2.410152856032154e-07, + "loss": 0.1743, + "step": 12860, + "task_loss": 0.4999997615814209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.035995529929199875, + "compression/movement_sparsity/importance_threshold": -0.39508374799377105, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16238951683044434, + "epoch": 4.65, + "learning_rate": 2.397845542087177e-07, + "loss": 0.1743, + "step": 12870, + "task_loss": 0.3286881744861603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0360542790911347, + "compression/movement_sparsity/importance_threshold": -0.3942834975829879, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1793985217809677, + "epoch": 4.65, + "learning_rate": 2.385565449857401e-07, + "loss": 0.1743, + "step": 12880, + "task_loss": 0.7459797263145447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036112948867686116, + "compression/movement_sparsity/importance_threshold": -0.39348432851845033, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18460488319396973, + "epoch": 4.66, + "learning_rate": 2.3733126233152456e-07, + "loss": 0.1882, + "step": 12890, + "task_loss": 0.3585508465766907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03617153931252552, + "compression/movement_sparsity/importance_threshold": -0.39268624006907443, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15785962343215942, + "epoch": 4.66, + "learning_rate": 2.3610871063355065e-07, + "loss": 0.187, + "step": 12900, + "task_loss": 0.519625723361969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03623005047932436, + "compression/movement_sparsity/importance_threshold": -0.3918892315037759, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14596977829933167, + "epoch": 4.67, + "learning_rate": 2.3488889426951907e-07, + "loss": 0.1701, + "step": 12910, + "task_loss": 0.41778382658958435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03628848242175402, + "compression/movement_sparsity/importance_threshold": -0.3910933020914711, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17587855458259583, + "epoch": 4.67, + "learning_rate": 2.336718176073349e-07, + "loss": 0.1728, + "step": 12920, + "task_loss": 0.8074888586997986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03634683519348591, + "compression/movement_sparsity/importance_threshold": -0.3902984511010759, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12496951222419739, + "epoch": 4.67, + "learning_rate": 2.32457485005094e-07, + "loss": 0.1697, + "step": 12930, + "task_loss": 0.2537425756454468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03640510884819145, + "compression/movement_sparsity/importance_threshold": -0.3895046778015065, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14235326647758484, + "epoch": 4.68, + "learning_rate": 2.3124590081106553e-07, + "loss": 0.1806, + "step": 12940, + "task_loss": 0.3784465789794922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03646330343954205, + "compression/movement_sparsity/importance_threshold": -0.38871198146167896, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14437982439994812, + "epoch": 4.68, + "learning_rate": 2.300370693636775e-07, + "loss": 0.1776, + "step": 12950, + "task_loss": 0.41959238052368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036521419021209124, + "compression/movement_sparsity/importance_threshold": -0.38792036135050934, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1669291853904724, + "epoch": 4.68, + "learning_rate": 2.2883099499150116e-07, + "loss": 0.1741, + "step": 12960, + "task_loss": 0.8136731386184692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036579455646864066, + "compression/movement_sparsity/importance_threshold": -0.38712981673691366, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13533678650856018, + "epoch": 4.69, + "learning_rate": 2.276276820132349e-07, + "loss": 0.1852, + "step": 12970, + "task_loss": 0.2773071527481079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0366374133701783, + "compression/movement_sparsity/importance_threshold": -0.38634034688980806, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17692357301712036, + "epoch": 4.69, + "learning_rate": 2.264271347376895e-07, + "loss": 0.1635, + "step": 12980, + "task_loss": 0.3910565972328186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03669529224482323, + "compression/movement_sparsity/importance_threshold": -0.38555195107810863, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18977247178554535, + "epoch": 4.69, + "learning_rate": 2.252293574637717e-07, + "loss": 0.1751, + "step": 12990, + "task_loss": 0.4595882296562195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03675309232447028, + "compression/movement_sparsity/importance_threshold": -0.38476462857073124, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1279372274875641, + "epoch": 4.7, + "learning_rate": 2.2403435448047014e-07, + "loss": 0.1708, + "step": 13000, + "task_loss": 0.4735584557056427 + }, + { + "epoch": 4.7, + "eval_exact_match": 83.4720908230842, + "eval_f1": 89.90774706672684, + "step": 13000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03681081366279085, + "compression/movement_sparsity/importance_threshold": -0.3839783786365921, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15371698141098022, + "epoch": 4.7, + "learning_rate": 2.228421300668386e-07, + "loss": 0.1755, + "step": 13010, + "task_loss": 0.5671071410179138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03686845631345635, + "compression/movement_sparsity/importance_threshold": -0.38319320054460726, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14245186746120453, + "epoch": 4.71, + "learning_rate": 2.2165268849198205e-07, + "loss": 0.1774, + "step": 13020, + "task_loss": 0.5267888307571411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03692602033013819, + "compression/movement_sparsity/importance_threshold": -0.3824090935636929, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1692107617855072, + "epoch": 4.71, + "learning_rate": 2.2046603401504082e-07, + "loss": 0.1713, + "step": 13030, + "task_loss": 0.4950706362724304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036983505766507795, + "compression/movement_sparsity/importance_threshold": -0.38162605696276486, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1489468812942505, + "epoch": 4.71, + "learning_rate": 2.192821708851741e-07, + "loss": 0.1798, + "step": 13040, + "task_loss": 0.28536754846572876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03704091267623654, + "compression/movement_sparsity/importance_threshold": -0.3808440900107394, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1268996000289917, + "epoch": 4.72, + "learning_rate": 2.181011033415473e-07, + "loss": 0.165, + "step": 13050, + "task_loss": 0.2892007827758789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03709824111299588, + "compression/movement_sparsity/importance_threshold": -0.3800631919765325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1905529797077179, + "epoch": 4.72, + "learning_rate": 2.1692283561331414e-07, + "loss": 0.1896, + "step": 13060, + "task_loss": 0.5020818710327148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03715549113045719, + "compression/movement_sparsity/importance_threshold": -0.3792833621290602, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11118581891059875, + "epoch": 4.72, + "learning_rate": 2.157473719196038e-07, + "loss": 0.1721, + "step": 13070, + "task_loss": 0.2518026828765869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03721266278229191, + "compression/movement_sparsity/importance_threshold": -0.3785045997372386, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14366507530212402, + "epoch": 4.73, + "learning_rate": 2.145747164695041e-07, + "loss": 0.1898, + "step": 13080, + "task_loss": 0.4140687584877014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03726975612217143, + "compression/movement_sparsity/importance_threshold": -0.37772690406998377, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13824304938316345, + "epoch": 4.73, + "learning_rate": 2.1340487346204762e-07, + "loss": 0.1739, + "step": 13090, + "task_loss": 0.3569027781486511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03732677120376716, + "compression/movement_sparsity/importance_threshold": -0.37695027439621187, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13565212488174438, + "epoch": 4.73, + "learning_rate": 2.1223784708619608e-07, + "loss": 0.1727, + "step": 13100, + "task_loss": 0.4583956003189087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037383708080750526, + "compression/movement_sparsity/importance_threshold": -0.3761747099848388, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15867725014686584, + "epoch": 4.74, + "learning_rate": 2.1107364152082507e-07, + "loss": 0.1761, + "step": 13110, + "task_loss": 0.2641053795814514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037440566806792926, + "compression/movement_sparsity/importance_threshold": -0.37540021010478075, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1482830047607422, + "epoch": 4.74, + "learning_rate": 2.099122609347097e-07, + "loss": 0.1914, + "step": 13120, + "task_loss": 0.7968321442604065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03749734743556577, + "compression/movement_sparsity/importance_threshold": -0.3746267740249538, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2047840803861618, + "epoch": 4.75, + "learning_rate": 2.0875370948650973e-07, + "loss": 0.1966, + "step": 13130, + "task_loss": 0.49344414472579956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03755405002074048, + "compression/movement_sparsity/importance_threshold": -0.3738544010142738, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1491725742816925, + "epoch": 4.75, + "learning_rate": 2.0759799132475365e-07, + "loss": 0.1798, + "step": 13140, + "task_loss": 0.4386056661605835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03761067461598845, + "compression/movement_sparsity/importance_threshold": -0.3730830903416571, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17659732699394226, + "epoch": 4.75, + "learning_rate": 2.0644511058782553e-07, + "loss": 0.182, + "step": 13150, + "task_loss": 0.32818663120269775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037667221274981115, + "compression/movement_sparsity/importance_threshold": -0.3723128412760196, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16805681586265564, + "epoch": 4.76, + "learning_rate": 2.0529507140394798e-07, + "loss": 0.1699, + "step": 13160, + "task_loss": 0.23117676377296448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03772369005138986, + "compression/movement_sparsity/importance_threshold": -0.37154365308627746, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15759176015853882, + "epoch": 4.76, + "learning_rate": 2.0414787789116994e-07, + "loss": 0.175, + "step": 13170, + "task_loss": 0.5075294375419617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03778008099888611, + "compression/movement_sparsity/importance_threshold": -0.3707755250413466, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16308321058750153, + "epoch": 4.76, + "learning_rate": 2.0300353415734927e-07, + "loss": 0.1767, + "step": 13180, + "task_loss": 0.3911818265914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037836394171141256, + "compression/movement_sparsity/importance_threshold": -0.37000845641014346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1465543508529663, + "epoch": 4.77, + "learning_rate": 2.0186204430014042e-07, + "loss": 0.1699, + "step": 13190, + "task_loss": 0.4668183922767639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03789262962182674, + "compression/movement_sparsity/importance_threshold": -0.36924244646158355, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16283223032951355, + "epoch": 4.77, + "learning_rate": 2.0072341240697842e-07, + "loss": 0.1668, + "step": 13200, + "task_loss": 0.5066724419593811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.037948787404613965, + "compression/movement_sparsity/importance_threshold": -0.3684774944645832, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2064799666404724, + "epoch": 4.77, + "learning_rate": 1.995876425550642e-07, + "loss": 0.1858, + "step": 13210, + "task_loss": 0.6612112522125244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03800486757317432, + "compression/movement_sparsity/importance_threshold": -0.3677135996880586, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1432875096797943, + "epoch": 4.78, + "learning_rate": 1.9845473881135112e-07, + "loss": 0.1654, + "step": 13220, + "task_loss": 0.21210426092147827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03806087018117923, + "compression/movement_sparsity/importance_threshold": -0.36695076140092575, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10303863883018494, + "epoch": 4.78, + "learning_rate": 1.9732470523252832e-07, + "loss": 0.1533, + "step": 13230, + "task_loss": 0.14866429567337036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03811679528230011, + "compression/movement_sparsity/importance_threshold": -0.36618897887210056, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12185746431350708, + "epoch": 4.78, + "learning_rate": 1.9619754586500859e-07, + "loss": 0.1716, + "step": 13240, + "task_loss": 0.6474106311798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03817264293020836, + "compression/movement_sparsity/importance_threshold": -0.3654282513704993, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18789950013160706, + "epoch": 4.79, + "learning_rate": 1.9507326474491258e-07, + "loss": 0.1776, + "step": 13250, + "task_loss": 0.5531033873558044 + }, + { + "epoch": 4.79, + "eval_exact_match": 83.50993377483444, + "eval_f1": 89.93742165413668, + "step": 13250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0382284131785754, + "compression/movement_sparsity/importance_threshold": -0.36466857816503795, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1531079113483429, + "epoch": 4.79, + "learning_rate": 1.93951865898054e-07, + "loss": 0.1685, + "step": 13260, + "task_loss": 0.2040596902370453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038284106081072626, + "compression/movement_sparsity/importance_threshold": -0.3639099585246327, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11716160178184509, + "epoch": 4.8, + "learning_rate": 1.9283335333992655e-07, + "loss": 0.1904, + "step": 13270, + "task_loss": 0.36657923460006714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03833972169137147, + "compression/movement_sparsity/importance_threshold": -0.36315239171819935, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1429518759250641, + "epoch": 4.8, + "learning_rate": 1.9171773107568766e-07, + "loss": 0.1871, + "step": 13280, + "task_loss": 0.605373740196228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03839526006314333, + "compression/movement_sparsity/importance_threshold": -0.36239587701465403, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14882807433605194, + "epoch": 4.8, + "learning_rate": 1.906050031001466e-07, + "loss": 0.1859, + "step": 13290, + "task_loss": 0.2876370847225189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03845072125005961, + "compression/movement_sparsity/importance_threshold": -0.3616404136829131, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15496216714382172, + "epoch": 4.81, + "learning_rate": 1.8949517339774746e-07, + "loss": 0.1685, + "step": 13300, + "task_loss": 0.49225640296936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038506105305791745, + "compression/movement_sparsity/importance_threshold": -0.36088600099189216, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14122232794761658, + "epoch": 4.81, + "learning_rate": 1.8838824594255708e-07, + "loss": 0.1756, + "step": 13310, + "task_loss": 0.3297005295753479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03856141228401112, + "compression/movement_sparsity/importance_threshold": -0.36013263821050767, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.167262464761734, + "epoch": 4.81, + "learning_rate": 1.8728422469824977e-07, + "loss": 0.1751, + "step": 13320, + "task_loss": 0.6074428558349609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038616642238389155, + "compression/movement_sparsity/importance_threshold": -0.35938032460767566, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13553129136562347, + "epoch": 4.82, + "learning_rate": 1.8618311361809324e-07, + "loss": 0.1715, + "step": 13330, + "task_loss": 0.1803513467311859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03867179522259727, + "compression/movement_sparsity/importance_threshold": -0.35862905945231194, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14316369593143463, + "epoch": 4.82, + "learning_rate": 1.8508491664493465e-07, + "loss": 0.1755, + "step": 13340, + "task_loss": 0.2262798398733139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03872687129030685, + "compression/movement_sparsity/importance_threshold": -0.35787884201333287, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16255033016204834, + "epoch": 4.82, + "learning_rate": 1.839896377111859e-07, + "loss": 0.18, + "step": 13350, + "task_loss": 0.5666951537132263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03878187049518934, + "compression/movement_sparsity/importance_threshold": -0.35712967155965425, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16210150718688965, + "epoch": 4.83, + "learning_rate": 1.828972807388106e-07, + "loss": 0.1819, + "step": 13360, + "task_loss": 0.6401165723800659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038836792890916114, + "compression/movement_sparsity/importance_threshold": -0.35638154736019234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1632276177406311, + "epoch": 4.83, + "learning_rate": 1.8180784963930928e-07, + "loss": 0.1725, + "step": 13370, + "task_loss": 0.457368403673172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03889163853115862, + "compression/movement_sparsity/importance_threshold": -0.35563446868386306, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1731918752193451, + "epoch": 4.84, + "learning_rate": 1.8072134831370512e-07, + "loss": 0.1622, + "step": 13380, + "task_loss": 0.352400541305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03894640746958824, + "compression/movement_sparsity/importance_threshold": -0.35488843479958265, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18019556999206543, + "epoch": 4.84, + "learning_rate": 1.796377806525311e-07, + "loss": 0.1895, + "step": 13390, + "task_loss": 0.44412076473236084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03900109975987639, + "compression/movement_sparsity/importance_threshold": -0.35414344497626715, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11305814981460571, + "epoch": 4.84, + "learning_rate": 1.7855715053581445e-07, + "loss": 0.1765, + "step": 13400, + "task_loss": 0.5358411073684692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03905571545569449, + "compression/movement_sparsity/importance_threshold": -0.3533994984828325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15808236598968506, + "epoch": 4.85, + "learning_rate": 1.7747946183306471e-07, + "loss": 0.1693, + "step": 13410, + "task_loss": 0.43483972549438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03911025461071394, + "compression/movement_sparsity/importance_threshold": -0.35265659458819476, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15852884948253632, + "epoch": 4.85, + "learning_rate": 1.764047184032579e-07, + "loss": 0.184, + "step": 13420, + "task_loss": 0.6633070707321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03916471727860616, + "compression/movement_sparsity/importance_threshold": -0.35191473256127015, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1737813502550125, + "epoch": 4.85, + "learning_rate": 1.7533292409482414e-07, + "loss": 0.1646, + "step": 13430, + "task_loss": 0.3524114489555359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039219103513042565, + "compression/movement_sparsity/importance_threshold": -0.35117391167097467, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15978504717350006, + "epoch": 4.86, + "learning_rate": 1.7426408274563343e-07, + "loss": 0.177, + "step": 13440, + "task_loss": 0.5557321906089783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03927341336769454, + "compression/movement_sparsity/importance_threshold": -0.35043413118622435, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1573951095342636, + "epoch": 4.86, + "learning_rate": 1.7319819818298166e-07, + "loss": 0.1765, + "step": 13450, + "task_loss": 0.5514523983001709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03932764689623351, + "compression/movement_sparsity/importance_threshold": -0.34969539037593544, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16307415068149567, + "epoch": 4.86, + "learning_rate": 1.7213527422357732e-07, + "loss": 0.1692, + "step": 13460, + "task_loss": 0.3270409107208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039381804152330925, + "compression/movement_sparsity/importance_threshold": -0.3489576885090234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1804170459508896, + "epoch": 4.87, + "learning_rate": 1.7107531467352697e-07, + "loss": 0.1793, + "step": 13470, + "task_loss": 0.3031477630138397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03943588518965812, + "compression/movement_sparsity/importance_threshold": -0.3482210248544052, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.176797017455101, + "epoch": 4.87, + "learning_rate": 1.70018323328323e-07, + "loss": 0.1721, + "step": 13480, + "task_loss": 0.528610348701477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03948989006188655, + "compression/movement_sparsity/importance_threshold": -0.34748539868099637, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14670194685459137, + "epoch": 4.88, + "learning_rate": 1.6896430397282914e-07, + "loss": 0.1938, + "step": 13490, + "task_loss": 0.41166549921035767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039543818822687635, + "compression/movement_sparsity/importance_threshold": -0.34675080925771296, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20626819133758545, + "epoch": 4.88, + "learning_rate": 1.679132603812663e-07, + "loss": 0.1797, + "step": 13500, + "task_loss": 0.8171520233154297 + }, + { + "epoch": 4.88, + "eval_exact_match": 83.61400189214758, + "eval_f1": 90.0307786000334, + "step": 13500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039597671525732764, + "compression/movement_sparsity/importance_threshold": -0.3460172558534711, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15772680938243866, + "epoch": 4.88, + "learning_rate": 1.6686519631720098e-07, + "loss": 0.1889, + "step": 13510, + "task_loss": 0.24707984924316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03965144822469335, + "compression/movement_sparsity/importance_threshold": -0.34528473773718693, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17334116995334625, + "epoch": 4.89, + "learning_rate": 1.658201155335295e-07, + "loss": 0.1797, + "step": 13520, + "task_loss": 0.5400213003158569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03970514897324082, + "compression/movement_sparsity/importance_threshold": -0.3445532541777765, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1472284197807312, + "epoch": 4.89, + "learning_rate": 1.6477802177246646e-07, + "loss": 0.171, + "step": 13530, + "task_loss": 0.5143538117408752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03975877382504656, + "compression/movement_sparsity/importance_threshold": -0.3438228044441558, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17164795100688934, + "epoch": 4.89, + "learning_rate": 1.637389187655306e-07, + "loss": 0.1743, + "step": 13540, + "task_loss": 0.505851149559021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039812322833782, + "compression/movement_sparsity/importance_threshold": -0.343093387805241, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17246219515800476, + "epoch": 4.9, + "learning_rate": 1.627028102335305e-07, + "loss": 0.1844, + "step": 13550, + "task_loss": 0.43365949392318726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039865796053118525, + "compression/movement_sparsity/importance_threshold": -0.34236500352994836, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17335930466651917, + "epoch": 4.9, + "learning_rate": 1.616696998865531e-07, + "loss": 0.1656, + "step": 13560, + "task_loss": 0.40370243787765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.039919193536727585, + "compression/movement_sparsity/importance_threshold": -0.34163765088719333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16594436764717102, + "epoch": 4.9, + "learning_rate": 1.60639591423949e-07, + "loss": 0.1728, + "step": 13570, + "task_loss": 0.3566591441631317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03997251533828056, + "compression/movement_sparsity/importance_threshold": -0.3409113291458926, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1836821734905243, + "epoch": 4.91, + "learning_rate": 1.596124885343203e-07, + "loss": 0.1792, + "step": 13580, + "task_loss": 0.3985489308834076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04002576151144888, + "compression/movement_sparsity/importance_threshold": -0.3401860375749618, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22170782089233398, + "epoch": 4.91, + "learning_rate": 1.5858839489550546e-07, + "loss": 0.1794, + "step": 13590, + "task_loss": 0.5840495824813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040078932109903945, + "compression/movement_sparsity/importance_threshold": -0.33946177544331735, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18641307950019836, + "epoch": 4.92, + "learning_rate": 1.575673141745689e-07, + "loss": 0.176, + "step": 13600, + "task_loss": 0.5444910526275635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04013202718731717, + "compression/movement_sparsity/importance_threshold": -0.33873854201987497, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17240336537361145, + "epoch": 4.92, + "learning_rate": 1.5654925002778574e-07, + "loss": 0.1887, + "step": 13610, + "task_loss": 0.4279804825782776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04018504679735995, + "compression/movement_sparsity/importance_threshold": -0.3380163365735511, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16231310367584229, + "epoch": 4.92, + "learning_rate": 1.5553420610062905e-07, + "loss": 0.1801, + "step": 13620, + "task_loss": 0.3823118805885315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04023799099370372, + "compression/movement_sparsity/importance_threshold": -0.3372951583732614, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16496533155441284, + "epoch": 4.93, + "learning_rate": 1.54522186027758e-07, + "loss": 0.1814, + "step": 13630, + "task_loss": 0.3801088035106659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04029085983001987, + "compression/movement_sparsity/importance_threshold": -0.3365750066879223, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1552436500787735, + "epoch": 4.93, + "learning_rate": 1.5351319343300294e-07, + "loss": 0.1735, + "step": 13640, + "task_loss": 0.8039931058883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040343653359979824, + "compression/movement_sparsity/importance_threshold": -0.3358558807864498, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1522938311100006, + "epoch": 4.93, + "learning_rate": 1.5250723192935433e-07, + "loss": 0.1814, + "step": 13650, + "task_loss": 0.6675360798835754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04039637163725498, + "compression/movement_sparsity/importance_threshold": -0.3351377799377597, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17480003833770752, + "epoch": 4.94, + "learning_rate": 1.5150430511894862e-07, + "loss": 0.1833, + "step": 13660, + "task_loss": 0.4187951683998108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04044901471551676, + "compression/movement_sparsity/importance_threshold": -0.33442070341076835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13274918496608734, + "epoch": 4.94, + "learning_rate": 1.5050441659305558e-07, + "loss": 0.1728, + "step": 13670, + "task_loss": 0.46560001373291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040501582648436577, + "compression/movement_sparsity/importance_threshold": -0.3337046504743917, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18608583509922028, + "epoch": 4.94, + "learning_rate": 1.495075699320658e-07, + "loss": 0.1855, + "step": 13680, + "task_loss": 0.5671523213386536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04055407548968582, + "compression/movement_sparsity/importance_threshold": -0.332989620397546, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19633609056472778, + "epoch": 4.95, + "learning_rate": 1.4851376870547705e-07, + "loss": 0.1814, + "step": 13690, + "task_loss": 0.31386256217956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04060649329293593, + "compression/movement_sparsity/importance_threshold": -0.33227561244914694, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17011404037475586, + "epoch": 4.95, + "learning_rate": 1.475230164718827e-07, + "loss": 0.1851, + "step": 13700, + "task_loss": 0.3444925546646118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040658836111858304, + "compression/movement_sparsity/importance_threshold": -0.3315626258981109, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17365840077400208, + "epoch": 4.95, + "learning_rate": 1.4653531677895748e-07, + "loss": 0.1759, + "step": 13710, + "task_loss": 0.4338461756706238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04071110400012435, + "compression/movement_sparsity/importance_threshold": -0.3308506600133537, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16433964669704437, + "epoch": 4.96, + "learning_rate": 1.455506731634466e-07, + "loss": 0.1771, + "step": 13720, + "task_loss": 0.1609778255224228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040763297011405464, + "compression/movement_sparsity/importance_threshold": -0.33013971406379183, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16652318835258484, + "epoch": 4.96, + "learning_rate": 1.445690891511515e-07, + "loss": 0.1792, + "step": 13730, + "task_loss": 0.5048503279685974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04081541519937309, + "compression/movement_sparsity/importance_threshold": -0.3294297873183408, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17034050822257996, + "epoch": 4.97, + "learning_rate": 1.4359056825691785e-07, + "loss": 0.1854, + "step": 13740, + "task_loss": 0.6351929903030396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0408674586176986, + "compression/movement_sparsity/importance_threshold": -0.3287208790459173, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14872363209724426, + "epoch": 4.97, + "learning_rate": 1.4261511398462333e-07, + "loss": 0.1821, + "step": 13750, + "task_loss": 0.3965833783149719 + }, + { + "epoch": 4.97, + "eval_exact_match": 83.62346263008514, + "eval_f1": 89.91764481115209, + "step": 13750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04091942732005344, + "compression/movement_sparsity/importance_threshold": -0.3280129885154368, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14365430176258087, + "epoch": 4.97, + "learning_rate": 1.4164272982716385e-07, + "loss": 0.1755, + "step": 13760, + "task_loss": 0.46597981452941895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04097132136010901, + "compression/movement_sparsity/importance_threshold": -0.32730611499581574, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13839992880821228, + "epoch": 4.98, + "learning_rate": 1.4067341926644283e-07, + "loss": 0.172, + "step": 13770, + "task_loss": 0.33191508054733276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04102314079153671, + "compression/movement_sparsity/importance_threshold": -0.32660025775597, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14204658567905426, + "epoch": 4.98, + "learning_rate": 1.3970718577335728e-07, + "loss": 0.1765, + "step": 13780, + "task_loss": 0.2808571457862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04107488566800795, + "compression/movement_sparsity/importance_threshold": -0.32589541606481587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1467570811510086, + "epoch": 4.98, + "learning_rate": 1.3874403280778602e-07, + "loss": 0.1718, + "step": 13790, + "task_loss": 0.19776900112628937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041126556043194165, + "compression/movement_sparsity/importance_threshold": -0.325191589191269, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16088761389255524, + "epoch": 4.99, + "learning_rate": 1.377839638185774e-07, + "loss": 0.1789, + "step": 13800, + "task_loss": 0.6403120756149292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04117815197076674, + "compression/movement_sparsity/importance_threshold": -0.3244887764042459, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15906648337841034, + "epoch": 4.99, + "learning_rate": 1.3682698224353584e-07, + "loss": 0.1813, + "step": 13810, + "task_loss": 0.4403289556503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041229673504397095, + "compression/movement_sparsity/importance_threshold": -0.32378697697266245, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1257646381855011, + "epoch": 4.99, + "learning_rate": 1.3587309150941152e-07, + "loss": 0.177, + "step": 13820, + "task_loss": 0.3140348196029663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04128112069775663, + "compression/movement_sparsity/importance_threshold": -0.32308619016543483, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21181027591228485, + "epoch": 5.0, + "learning_rate": 1.349222950318859e-07, + "loss": 0.1915, + "step": 13830, + "task_loss": 0.4248882830142975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04133249360451678, + "compression/movement_sparsity/importance_threshold": -0.3223864152514787, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1628672480583191, + "epoch": 5.0, + "learning_rate": 1.3397459621556128e-07, + "loss": 0.1719, + "step": 13840, + "task_loss": 0.3244702219963074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04138379227834893, + "compression/movement_sparsity/importance_threshold": -0.3216876514997108, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13734124600887299, + "epoch": 5.01, + "learning_rate": 1.3302999845394802e-07, + "loss": 0.1865, + "step": 13850, + "task_loss": 0.36722129583358765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041435016772924514, + "compression/movement_sparsity/importance_threshold": -0.32098989817904644, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1720079779624939, + "epoch": 5.01, + "learning_rate": 1.3208850512945135e-07, + "loss": 0.1831, + "step": 13860, + "task_loss": 0.3153786063194275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04148616714191491, + "compression/movement_sparsity/importance_threshold": -0.32029315455840246, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1428808569908142, + "epoch": 5.01, + "learning_rate": 1.311501196133612e-07, + "loss": 0.1775, + "step": 13870, + "task_loss": 0.6260837912559509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04153724343899157, + "compression/movement_sparsity/importance_threshold": -0.3195974199066942, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14632534980773926, + "epoch": 5.02, + "learning_rate": 1.3021484526583814e-07, + "loss": 0.1824, + "step": 13880, + "task_loss": 0.2916293740272522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04158824571782587, + "compression/movement_sparsity/importance_threshold": -0.3189026934928383, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15432986617088318, + "epoch": 5.02, + "learning_rate": 1.2928268543590304e-07, + "loss": 0.1779, + "step": 13890, + "task_loss": 0.4513210654258728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041639174032089235, + "compression/movement_sparsity/importance_threshold": -0.3182089745857505, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1750912070274353, + "epoch": 5.02, + "learning_rate": 1.2835364346142397e-07, + "loss": 0.1778, + "step": 13900, + "task_loss": 0.5073824524879456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04169002843545308, + "compression/movement_sparsity/importance_threshold": -0.31751626245434694, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19097644090652466, + "epoch": 5.03, + "learning_rate": 1.2742772266910485e-07, + "loss": 0.1807, + "step": 13910, + "task_loss": 0.4349386692047119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04174080898158881, + "compression/movement_sparsity/importance_threshold": -0.3168245563675438, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.185117706656456, + "epoch": 5.03, + "learning_rate": 1.265049263744734e-07, + "loss": 0.1708, + "step": 13920, + "task_loss": 0.3032917380332947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04179151572416783, + "compression/movement_sparsity/importance_threshold": -0.3161338555942569, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1254778802394867, + "epoch": 5.03, + "learning_rate": 1.2558525788186834e-07, + "loss": 0.1649, + "step": 13930, + "task_loss": 0.5277553200721741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041842148716861566, + "compression/movement_sparsity/importance_threshold": -0.31544415940340254, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1639249622821808, + "epoch": 5.04, + "learning_rate": 1.2466872048442935e-07, + "loss": 0.1671, + "step": 13940, + "task_loss": 0.4619181752204895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04189270801334141, + "compression/movement_sparsity/importance_threshold": -0.3147554670638968, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15116430819034576, + "epoch": 5.04, + "learning_rate": 1.237553174640842e-07, + "loss": 0.1769, + "step": 13950, + "task_loss": 0.2680996358394623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041943193667278784, + "compression/movement_sparsity/importance_threshold": -0.31406777784465545, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15021368861198425, + "epoch": 5.05, + "learning_rate": 1.228450520915364e-07, + "loss": 0.1827, + "step": 13960, + "task_loss": 0.4265400767326355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0419936057323451, + "compression/movement_sparsity/importance_threshold": -0.3133810910145949, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1929803192615509, + "epoch": 5.05, + "learning_rate": 1.21937927626255e-07, + "loss": 0.1773, + "step": 13970, + "task_loss": 0.5100224614143372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04204394426221176, + "compression/movement_sparsity/importance_threshold": -0.312695405842631, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15454885363578796, + "epoch": 5.05, + "learning_rate": 1.2103394731646143e-07, + "loss": 0.1715, + "step": 13980, + "task_loss": 0.6407804489135742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04209420931055017, + "compression/movement_sparsity/importance_threshold": -0.3120107215976802, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17369097471237183, + "epoch": 5.06, + "learning_rate": 1.2013311439911954e-07, + "loss": 0.1763, + "step": 13990, + "task_loss": 0.4219602346420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042144400931031764, + "compression/movement_sparsity/importance_threshold": -0.31132703754865787, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1915154606103897, + "epoch": 5.06, + "learning_rate": 1.1923543209992183e-07, + "loss": 0.1761, + "step": 14000, + "task_loss": 0.3387451767921448 + }, + { + "epoch": 5.06, + "eval_exact_match": 83.67076631977294, + "eval_f1": 90.03655684421615, + "step": 14000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04219451917732794, + "compression/movement_sparsity/importance_threshold": -0.31064435296448056, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.153361976146698, + "epoch": 5.06, + "learning_rate": 1.1834090363327986e-07, + "loss": 0.1855, + "step": 14010, + "task_loss": 0.576560378074646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042244564103110095, + "compression/movement_sparsity/importance_threshold": -0.3099626671140644, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14965105056762695, + "epoch": 5.07, + "learning_rate": 1.174495322023118e-07, + "loss": 0.1879, + "step": 14020, + "task_loss": 0.4468023478984833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042294535762049654, + "compression/movement_sparsity/importance_threshold": -0.3092819792663253, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15595000982284546, + "epoch": 5.07, + "learning_rate": 1.1656132099883131e-07, + "loss": 0.1716, + "step": 14030, + "task_loss": 0.15840017795562744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04234443420781801, + "compression/movement_sparsity/importance_threshold": -0.30860228869017936, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.147873193025589, + "epoch": 5.07, + "learning_rate": 1.1567627320333594e-07, + "loss": 0.1864, + "step": 14040, + "task_loss": 0.8370780348777771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04239425949408662, + "compression/movement_sparsity/importance_threshold": -0.30792359465454233, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15930218994617462, + "epoch": 5.08, + "learning_rate": 1.1479439198499519e-07, + "loss": 0.179, + "step": 14050, + "task_loss": 0.4646834135055542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04244401167452685, + "compression/movement_sparsity/importance_threshold": -0.3072458964283309, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15202897787094116, + "epoch": 5.08, + "learning_rate": 1.1391568050164014e-07, + "loss": 0.1753, + "step": 14060, + "task_loss": 0.4561488628387451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04249369080281012, + "compression/movement_sparsity/importance_threshold": -0.3065691932804606, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18747982382774353, + "epoch": 5.08, + "learning_rate": 1.1304014189975197e-07, + "loss": 0.1802, + "step": 14070, + "task_loss": 0.33272579312324524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042543296932607846, + "compression/movement_sparsity/importance_threshold": -0.30589348447984777, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13845214247703552, + "epoch": 5.09, + "learning_rate": 1.1216777931444987e-07, + "loss": 0.1822, + "step": 14080, + "task_loss": 0.27083098888397217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042592830117591435, + "compression/movement_sparsity/importance_threshold": -0.3052187692954085, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16344168782234192, + "epoch": 5.09, + "learning_rate": 1.1129859586948098e-07, + "loss": 0.1797, + "step": 14090, + "task_loss": 0.4010601043701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04264229041143231, + "compression/movement_sparsity/importance_threshold": -0.30454504699605844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15759651362895966, + "epoch": 5.1, + "learning_rate": 1.1043259467720778e-07, + "loss": 0.1782, + "step": 14100, + "task_loss": 0.30105510354042053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042691677867801855, + "compression/movement_sparsity/importance_threshold": -0.30387231685071436, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17411063611507416, + "epoch": 5.1, + "learning_rate": 1.0956977883859886e-07, + "loss": 0.1891, + "step": 14110, + "task_loss": 0.466052770614624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0427409925403715, + "compression/movement_sparsity/importance_threshold": -0.3032005781282918, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11521792411804199, + "epoch": 5.1, + "learning_rate": 1.0871015144321571e-07, + "loss": 0.1681, + "step": 14120, + "task_loss": 0.5160799026489258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042790234482812675, + "compression/movement_sparsity/importance_threshold": -0.30252983009770684, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1370161473751068, + "epoch": 5.11, + "learning_rate": 1.078537155692032e-07, + "loss": 0.1707, + "step": 14130, + "task_loss": 0.19232416152954102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042839403748796745, + "compression/movement_sparsity/importance_threshold": -0.3018600720278759, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14609548449516296, + "epoch": 5.11, + "learning_rate": 1.0700047428327818e-07, + "loss": 0.1588, + "step": 14140, + "task_loss": 0.476870059967041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04288850039199515, + "compression/movement_sparsity/importance_threshold": -0.3011913031877147, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13242539763450623, + "epoch": 5.11, + "learning_rate": 1.0615043064071783e-07, + "loss": 0.1783, + "step": 14150, + "task_loss": 0.4068664014339447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04293752446607929, + "compression/movement_sparsity/importance_threshold": -0.3005235228461395, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15072208642959595, + "epoch": 5.12, + "learning_rate": 1.0530358768534997e-07, + "loss": 0.1821, + "step": 14160, + "task_loss": 0.6981538534164429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04298647602472058, + "compression/movement_sparsity/importance_threshold": -0.29985673027206616, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17192161083221436, + "epoch": 5.12, + "learning_rate": 1.0445994844954064e-07, + "loss": 0.1739, + "step": 14170, + "task_loss": 0.5850452184677124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043035355121590436, + "compression/movement_sparsity/importance_threshold": -0.299190924734411, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1931232213973999, + "epoch": 5.12, + "learning_rate": 1.0361951595418439e-07, + "loss": 0.1794, + "step": 14180, + "task_loss": 0.3741414248943329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04308416181036027, + "compression/movement_sparsity/importance_threshold": -0.29852610550209, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14067162573337555, + "epoch": 5.13, + "learning_rate": 1.0278229320869336e-07, + "loss": 0.1783, + "step": 14190, + "task_loss": 0.3252699673175812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04313289614470146, + "compression/movement_sparsity/importance_threshold": -0.29786227184401926, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21310460567474365, + "epoch": 5.13, + "learning_rate": 1.0194828321098569e-07, + "loss": 0.1935, + "step": 14200, + "task_loss": 0.5130503177642822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043181558178285465, + "compression/movement_sparsity/importance_threshold": -0.2971994230291146, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17947247624397278, + "epoch": 5.14, + "learning_rate": 1.0111748894747596e-07, + "loss": 0.1843, + "step": 14210, + "task_loss": 0.36428236961364746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04323014796478367, + "compression/movement_sparsity/importance_threshold": -0.29653755832629225, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12573717534542084, + "epoch": 5.14, + "learning_rate": 1.0028991339306336e-07, + "loss": 0.1788, + "step": 14220, + "task_loss": 0.6074889898300171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04327866555786748, + "compression/movement_sparsity/importance_threshold": -0.2958766770044685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14682647585868835, + "epoch": 5.14, + "learning_rate": 9.946555951112178e-08, + "loss": 0.185, + "step": 14230, + "task_loss": 0.3594474196434021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04332711101120833, + "compression/movement_sparsity/importance_threshold": -0.2952167783325591, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11430035531520844, + "epoch": 5.15, + "learning_rate": 9.864443025348934e-08, + "loss": 0.169, + "step": 14240, + "task_loss": 0.2298082858324051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0433754843784776, + "compression/movement_sparsity/importance_threshold": -0.2945578615794803, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1614457666873932, + "epoch": 5.15, + "learning_rate": 9.782652856045648e-08, + "loss": 0.1708, + "step": 14250, + "task_loss": 0.7426341772079468 + }, + { + "epoch": 5.15, + "eval_exact_match": 83.74645222327341, + "eval_f1": 90.08076277207161, + "step": 14250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04342378571334672, + "compression/movement_sparsity/importance_threshold": -0.29389992601414805, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1826505959033966, + "epoch": 5.15, + "learning_rate": 9.701185736075756e-08, + "loss": 0.1722, + "step": 14260, + "task_loss": 0.6813669800758362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043472015069487085, + "compression/movement_sparsity/importance_threshold": -0.2932429709054786, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1705678105354309, + "epoch": 5.16, + "learning_rate": 9.620041957155834e-08, + "loss": 0.1868, + "step": 14270, + "task_loss": 0.464776873588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04352017250057013, + "compression/movement_sparsity/importance_threshold": -0.29258699552238776, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16456995904445648, + "epoch": 5.16, + "learning_rate": 9.539221809844722e-08, + "loss": 0.1777, + "step": 14280, + "task_loss": 0.4324108362197876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04356825806026725, + "compression/movement_sparsity/importance_threshold": -0.29193199913379164, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17111265659332275, + "epoch": 5.16, + "learning_rate": 9.458725583542315e-08, + "loss": 0.1817, + "step": 14290, + "task_loss": 0.4643493592739105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043616271802249855, + "compression/movement_sparsity/importance_threshold": -0.29127798100860647, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1873432695865631, + "epoch": 5.17, + "learning_rate": 9.378553566488668e-08, + "loss": 0.1764, + "step": 14300, + "task_loss": 0.4464597702026367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04366421378018936, + "compression/movement_sparsity/importance_threshold": -0.29062494041574827, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1443527191877365, + "epoch": 5.17, + "learning_rate": 9.298706045762927e-08, + "loss": 0.1753, + "step": 14310, + "task_loss": 0.3243370056152344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04371208404775718, + "compression/movement_sparsity/importance_threshold": -0.28997287662413296, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13130953907966614, + "epoch": 5.18, + "learning_rate": 9.219183307282219e-08, + "loss": 0.1697, + "step": 14320, + "task_loss": 0.5415463447570801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04375988265862469, + "compression/movement_sparsity/importance_threshold": -0.2893217889026771, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2080850899219513, + "epoch": 5.18, + "learning_rate": 9.139985635800784e-08, + "loss": 0.1908, + "step": 14330, + "task_loss": 0.3747154474258423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04380760966646336, + "compression/movement_sparsity/importance_threshold": -0.288671676520296, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14363014698028564, + "epoch": 5.18, + "learning_rate": 9.061113314908764e-08, + "loss": 0.1722, + "step": 14340, + "task_loss": 0.6961146593093872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04385526512494456, + "compression/movement_sparsity/importance_threshold": -0.2880225387459062, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16117852926254272, + "epoch": 5.19, + "learning_rate": 8.982566627031363e-08, + "loss": 0.1782, + "step": 14350, + "task_loss": 0.3901559114456177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043902849087739705, + "compression/movement_sparsity/importance_threshold": -0.28737437484842376, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1831967532634735, + "epoch": 5.19, + "learning_rate": 8.904345853427753e-08, + "loss": 0.1756, + "step": 14360, + "task_loss": 0.4879865348339081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04395036160852022, + "compression/movement_sparsity/importance_threshold": -0.28672718409676456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13761597871780396, + "epoch": 5.19, + "learning_rate": 8.826451274190039e-08, + "loss": 0.1651, + "step": 14370, + "task_loss": 0.3214653432369232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043997802740957506, + "compression/movement_sparsity/importance_threshold": -0.28608096575984476, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2639523446559906, + "epoch": 5.2, + "learning_rate": 8.748883168242327e-08, + "loss": 0.1827, + "step": 14380, + "task_loss": 0.5773433446884155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04404517253872297, + "compression/movement_sparsity/importance_threshold": -0.2854357191065805, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12245256453752518, + "epoch": 5.2, + "learning_rate": 8.671641813339681e-08, + "loss": 0.18, + "step": 14390, + "task_loss": 0.33123326301574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04409247105548803, + "compression/movement_sparsity/importance_threshold": -0.2847914434058878, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17285960912704468, + "epoch": 5.2, + "learning_rate": 8.594727486067155e-08, + "loss": 0.1814, + "step": 14400, + "task_loss": 0.43380165100097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04413969834492409, + "compression/movement_sparsity/importance_threshold": -0.2841481379266828, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18825030326843262, + "epoch": 5.21, + "learning_rate": 8.518140461838729e-08, + "loss": 0.1752, + "step": 14410, + "task_loss": 0.38604140281677246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044186854460702585, + "compression/movement_sparsity/importance_threshold": -0.2835058019378811, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1609213650226593, + "epoch": 5.21, + "learning_rate": 8.441881014896434e-08, + "loss": 0.1699, + "step": 14420, + "task_loss": 0.5970258116722107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044233939456494877, + "compression/movement_sparsity/importance_threshold": -0.2828644347083996, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14135953783988953, + "epoch": 5.22, + "learning_rate": 8.365949418309327e-08, + "loss": 0.1687, + "step": 14430, + "task_loss": 0.2799544334411621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04428095338597242, + "compression/movement_sparsity/importance_threshold": -0.2822240355071536, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15112349390983582, + "epoch": 5.22, + "learning_rate": 8.290345943972433e-08, + "loss": 0.1782, + "step": 14440, + "task_loss": 0.41910994052886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044327896302806605, + "compression/movement_sparsity/importance_threshold": -0.2815846036030596, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19787323474884033, + "epoch": 5.22, + "learning_rate": 8.215070862605922e-08, + "loss": 0.1737, + "step": 14450, + "task_loss": 0.4692561626434326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04437476826066885, + "compression/movement_sparsity/importance_threshold": -0.28094613826503345, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18470437824726105, + "epoch": 5.23, + "learning_rate": 8.140124443753982e-08, + "loss": 0.1712, + "step": 14460, + "task_loss": 0.45406365394592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04442156931323056, + "compression/movement_sparsity/importance_threshold": -0.2803086387619913, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13995935022830963, + "epoch": 5.23, + "learning_rate": 8.065506955783985e-08, + "loss": 0.1751, + "step": 14470, + "task_loss": 0.5951769948005676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04446829951416315, + "compression/movement_sparsity/importance_threshold": -0.2796721043628493, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21652866899967194, + "epoch": 5.23, + "learning_rate": 7.991218665885458e-08, + "loss": 0.1869, + "step": 14480, + "task_loss": 0.4158879518508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044514958917138026, + "compression/movement_sparsity/importance_threshold": -0.27903653433652353, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16511324048042297, + "epoch": 5.24, + "learning_rate": 7.917259840069112e-08, + "loss": 0.1711, + "step": 14490, + "task_loss": 0.42348212003707886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04456154757582661, + "compression/movement_sparsity/importance_threshold": -0.27840192795192975, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18495193123817444, + "epoch": 5.24, + "learning_rate": 7.843630743165952e-08, + "loss": 0.1812, + "step": 14500, + "task_loss": 0.6287417411804199 + }, + { + "epoch": 5.24, + "eval_exact_match": 83.6329233680227, + "eval_f1": 90.02631898119101, + "step": 14500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044608065543900295, + "compression/movement_sparsity/importance_threshold": -0.2777682844779844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.179265558719635, + "epoch": 5.24, + "learning_rate": 7.770331638826266e-08, + "loss": 0.1807, + "step": 14510, + "task_loss": 0.5263575315475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04465451287503051, + "compression/movement_sparsity/importance_threshold": -0.2771356031836033, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1451658308506012, + "epoch": 5.25, + "learning_rate": 7.697362789518757e-08, + "loss": 0.1739, + "step": 14520, + "task_loss": 0.36418962478637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044700889622888644, + "compression/movement_sparsity/importance_threshold": -0.2765038833377026, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1474297046661377, + "epoch": 5.25, + "learning_rate": 7.624724456529475e-08, + "loss": 0.1855, + "step": 14530, + "task_loss": 0.3640234172344208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04474719584114613, + "compression/movement_sparsity/importance_threshold": -0.2758731242091984, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13829626142978668, + "epoch": 5.25, + "learning_rate": 7.552416899961011e-08, + "loss": 0.1974, + "step": 14540, + "task_loss": 0.28583553433418274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04479343158347436, + "compression/movement_sparsity/importance_threshold": -0.2752433250670069, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1722092181444168, + "epoch": 5.26, + "learning_rate": 7.48762312690956e-08, + "loss": 0.1966, + "step": 14550, + "task_loss": 0.3825565278530121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044839596903544755, + "compression/movement_sparsity/importance_threshold": -0.27461448518004383, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1537836194038391, + "epoch": 5.26, + "learning_rate": 7.415944757880465e-08, + "loss": 0.1829, + "step": 14560, + "task_loss": 0.42369458079338074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04488569185502873, + "compression/movement_sparsity/importance_threshold": -0.2739866038172255, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15801842510700226, + "epoch": 5.27, + "learning_rate": 7.344597912868367e-08, + "loss": 0.1636, + "step": 14570, + "task_loss": 0.5285253524780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044931716491597676, + "compression/movement_sparsity/importance_threshold": -0.27335968024746793, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15832439064979553, + "epoch": 5.27, + "learning_rate": 7.273582847351289e-08, + "loss": 0.1715, + "step": 14580, + "task_loss": 0.4416083097457886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044977670866923024, + "compression/movement_sparsity/importance_threshold": -0.27273371373968713, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18099182844161987, + "epoch": 5.27, + "learning_rate": 7.202899815619234e-08, + "loss": 0.1769, + "step": 14590, + "task_loss": 0.6669715046882629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045023555034676165, + "compression/movement_sparsity/importance_threshold": -0.27210870356279937, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1620885580778122, + "epoch": 5.28, + "learning_rate": 7.132549070773286e-08, + "loss": 0.1729, + "step": 14600, + "task_loss": 0.25877806544303894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04506936904852855, + "compression/movement_sparsity/importance_threshold": -0.2714846489857202, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14744767546653748, + "epoch": 5.28, + "learning_rate": 7.062530864724625e-08, + "loss": 0.1746, + "step": 14610, + "task_loss": 0.5256980061531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04511511296215154, + "compression/movement_sparsity/importance_threshold": -0.2708615492773664, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18050046265125275, + "epoch": 5.28, + "learning_rate": 6.99284544819373e-08, + "loss": 0.1755, + "step": 14620, + "task_loss": 0.4965248703956604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04516078682921657, + "compression/movement_sparsity/importance_threshold": -0.27023940370665356, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15234878659248352, + "epoch": 5.29, + "learning_rate": 6.923493070709397e-08, + "loss": 0.1781, + "step": 14630, + "task_loss": 0.3228938579559326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04520639070339504, + "compression/movement_sparsity/importance_threshold": -0.2696182115424979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16729338467121124, + "epoch": 5.29, + "learning_rate": 6.85447398060791e-08, + "loss": 0.1709, + "step": 14640, + "task_loss": 0.8934930562973022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045251924638358386, + "compression/movement_sparsity/importance_threshold": -0.26899797205381526, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15839503705501556, + "epoch": 5.29, + "learning_rate": 6.785788425032124e-08, + "loss": 0.1794, + "step": 14650, + "task_loss": 0.33643460273742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04529738868777799, + "compression/movement_sparsity/importance_threshold": -0.2683786845095222, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19412821531295776, + "epoch": 5.3, + "learning_rate": 6.717436649930508e-08, + "loss": 0.1924, + "step": 14660, + "task_loss": 0.9410861134529114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04534278290532528, + "compression/movement_sparsity/importance_threshold": -0.2677603481785342, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12971442937850952, + "epoch": 5.3, + "learning_rate": 6.649418900056425e-08, + "loss": 0.1753, + "step": 14670, + "task_loss": 0.26334819197654724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045388107344671656, + "compression/movement_sparsity/importance_threshold": -0.26714296232976775, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1588962972164154, + "epoch": 5.31, + "learning_rate": 6.581735418967094e-08, + "loss": 0.1962, + "step": 14680, + "task_loss": 0.35144782066345215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04543336205948853, + "compression/movement_sparsity/importance_threshold": -0.2665265262321388, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15858691930770874, + "epoch": 5.31, + "learning_rate": 6.514386449022846e-08, + "loss": 0.1875, + "step": 14690, + "task_loss": 0.45107176899909973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04547854710344731, + "compression/movement_sparsity/importance_threshold": -0.2659110391545635, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15156474709510803, + "epoch": 5.31, + "learning_rate": 6.447372231386138e-08, + "loss": 0.1698, + "step": 14700, + "task_loss": 0.41857224702835083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04552366253021943, + "compression/movement_sparsity/importance_threshold": -0.26529650036595753, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1653667539358139, + "epoch": 5.32, + "learning_rate": 6.380693006020788e-08, + "loss": 0.1625, + "step": 14710, + "task_loss": 0.32113200426101685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04556870839347626, + "compression/movement_sparsity/importance_threshold": -0.2646829091352375, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1332647204399109, + "epoch": 5.32, + "learning_rate": 6.3143490116911e-08, + "loss": 0.164, + "step": 14720, + "task_loss": 0.2849538326263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045613684746889246, + "compression/movement_sparsity/importance_threshold": -0.264070264731319, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1563533991575241, + "epoch": 5.32, + "learning_rate": 6.248340485960912e-08, + "loss": 0.177, + "step": 14730, + "task_loss": 0.27913862466812134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04565859164412978, + "compression/movement_sparsity/importance_threshold": -0.2634585664231185, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15281318128108978, + "epoch": 5.33, + "learning_rate": 6.182667665192876e-08, + "loss": 0.1773, + "step": 14740, + "task_loss": 0.2936190664768219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04570342913886928, + "compression/movement_sparsity/importance_threshold": -0.2628478134795518, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15253296494483948, + "epoch": 5.33, + "learning_rate": 6.117330784547547e-08, + "loss": 0.179, + "step": 14750, + "task_loss": 0.4092378616333008 + }, + { + "epoch": 5.33, + "eval_exact_match": 83.69914853358561, + "eval_f1": 90.03206384226705, + "step": 14750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04574819728477916, + "compression/movement_sparsity/importance_threshold": -0.262238005169535, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13614681363105774, + "epoch": 5.33, + "learning_rate": 6.052330077982548e-08, + "loss": 0.1718, + "step": 14760, + "task_loss": 0.23583835363388062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04579289613553082, + "compression/movement_sparsity/importance_threshold": -0.2616291407619843, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17683929204940796, + "epoch": 5.34, + "learning_rate": 5.987665778251739e-08, + "loss": 0.1807, + "step": 14770, + "task_loss": 0.3152199983596802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04583752574479569, + "compression/movement_sparsity/importance_threshold": -0.2610212195258156, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17758771777153015, + "epoch": 5.34, + "learning_rate": 5.9233381169043415e-08, + "loss": 0.1767, + "step": 14780, + "task_loss": 0.6586979031562805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045882086166245145, + "compression/movement_sparsity/importance_threshold": -0.2604142407299451, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15100859105587006, + "epoch": 5.35, + "learning_rate": 5.8593473242842026e-08, + "loss": 0.1851, + "step": 14790, + "task_loss": 0.5096355676651001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045926577453550624, + "compression/movement_sparsity/importance_threshold": -0.2598082036432888, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15634649991989136, + "epoch": 5.35, + "learning_rate": 5.795693629528842e-08, + "loss": 0.1731, + "step": 14800, + "task_loss": 0.28395843505859375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04597099966038353, + "compression/movement_sparsity/importance_threshold": -0.25920310753476283, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17545104026794434, + "epoch": 5.35, + "learning_rate": 5.732377260568777e-08, + "loss": 0.1671, + "step": 14810, + "task_loss": 0.3866184949874878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04601535284041529, + "compression/movement_sparsity/importance_threshold": -0.2585989516732832, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1806982159614563, + "epoch": 5.36, + "learning_rate": 5.669398444126605e-08, + "loss": 0.176, + "step": 14820, + "task_loss": 0.5448100566864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046059637047317276, + "compression/movement_sparsity/importance_threshold": -0.257995735327766, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13397540152072906, + "epoch": 5.36, + "learning_rate": 5.606757405716189e-08, + "loss": 0.1662, + "step": 14830, + "task_loss": 0.18283666670322418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046103852334760935, + "compression/movement_sparsity/importance_threshold": -0.2573934577671272, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1665625274181366, + "epoch": 5.36, + "learning_rate": 5.544454369641927e-08, + "loss": 0.178, + "step": 14840, + "task_loss": 0.37204509973526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04614799875641766, + "compression/movement_sparsity/importance_threshold": -0.25679211826028303, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15637090802192688, + "epoch": 5.37, + "learning_rate": 5.482489558997849e-08, + "loss": 0.1857, + "step": 14850, + "task_loss": 0.6641359925270081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04619207636595886, + "compression/movement_sparsity/importance_threshold": -0.25619171607614943, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1537550389766693, + "epoch": 5.37, + "learning_rate": 5.420863195666925e-08, + "loss": 0.1582, + "step": 14860, + "task_loss": 0.2819337844848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04623608521705596, + "compression/movement_sparsity/importance_threshold": -0.25559225048364254, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1731978952884674, + "epoch": 5.37, + "learning_rate": 5.35957550032019e-08, + "loss": 0.1711, + "step": 14870, + "task_loss": 0.3445536494255066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04628002536338036, + "compression/movement_sparsity/importance_threshold": -0.25499372075167837, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1405542939901352, + "epoch": 5.38, + "learning_rate": 5.298626692415975e-08, + "loss": 0.1704, + "step": 14880, + "task_loss": 0.4534985423088074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04632389685860346, + "compression/movement_sparsity/importance_threshold": -0.2543961261491732, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1388224959373474, + "epoch": 5.38, + "learning_rate": 5.238016990199157e-08, + "loss": 0.1652, + "step": 14890, + "task_loss": 0.3235897123813629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046367699756396714, + "compression/movement_sparsity/importance_threshold": -0.25379946594504255, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1863054782152176, + "epoch": 5.38, + "learning_rate": 5.1777466107002844e-08, + "loss": 0.1791, + "step": 14900, + "task_loss": 0.5134084224700928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046411434110431464, + "compression/movement_sparsity/importance_threshold": -0.2532037394082032, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16694492101669312, + "epoch": 5.39, + "learning_rate": 5.117815769734946e-08, + "loss": 0.1711, + "step": 14910, + "task_loss": 0.45068734884262085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04645509997437919, + "compression/movement_sparsity/importance_threshold": -0.25260894580757065, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1374826282262802, + "epoch": 5.39, + "learning_rate": 5.058224681902834e-08, + "loss": 0.1673, + "step": 14920, + "task_loss": 0.1980714350938797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046498697401911246, + "compression/movement_sparsity/importance_threshold": -0.25201508441206133, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1443094164133072, + "epoch": 5.4, + "learning_rate": 4.998973560587105e-08, + "loss": 0.1678, + "step": 14930, + "task_loss": 0.2809217870235443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04654222644669908, + "compression/movement_sparsity/importance_threshold": -0.251422154490591, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1853853315114975, + "epoch": 5.4, + "learning_rate": 4.940062617953567e-08, + "loss": 0.1701, + "step": 14940, + "task_loss": 0.3242225646972656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04658568716241408, + "compression/movement_sparsity/importance_threshold": -0.2508301553120761, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16819274425506592, + "epoch": 5.4, + "learning_rate": 4.881492064949888e-08, + "loss": 0.1736, + "step": 14950, + "task_loss": 0.5310601592063904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04662907960272768, + "compression/movement_sparsity/importance_threshold": -0.25023908614543233, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15723347663879395, + "epoch": 5.41, + "learning_rate": 4.823262111304904e-08, + "loss": 0.1747, + "step": 14960, + "task_loss": 0.5195972919464111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04667240382131126, + "compression/movement_sparsity/importance_threshold": -0.249648946259576, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1590065211057663, + "epoch": 5.41, + "learning_rate": 4.7653729655278254e-08, + "loss": 0.1759, + "step": 14970, + "task_loss": 0.4165058135986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04671565987183626, + "compression/movement_sparsity/importance_threshold": -0.24905973492342304, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17201870679855347, + "epoch": 5.41, + "learning_rate": 4.707824834907481e-08, + "loss": 0.1829, + "step": 14980, + "task_loss": 0.3914673924446106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046758847807974076, + "compression/movement_sparsity/importance_threshold": -0.24847145140588967, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18516141176223755, + "epoch": 5.42, + "learning_rate": 4.650617925511635e-08, + "loss": 0.1768, + "step": 14990, + "task_loss": 0.47868090867996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04680196768339613, + "compression/movement_sparsity/importance_threshold": -0.2478840949758917, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13710492849349976, + "epoch": 5.42, + "learning_rate": 4.5937524421861826e-08, + "loss": 0.1677, + "step": 15000, + "task_loss": 0.3779285252094269 + }, + { + "epoch": 5.42, + "eval_exact_match": 83.49101229895932, + "eval_f1": 89.94383123761723, + "step": 15000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04684501955177381, + "compression/movement_sparsity/importance_threshold": -0.2472976649023455, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17632843554019928, + "epoch": 5.42, + "learning_rate": 4.537228588554476e-08, + "loss": 0.1732, + "step": 15010, + "task_loss": 0.6332323551177979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.046888003466778544, + "compression/movement_sparsity/importance_threshold": -0.24671216045416688, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16848604381084442, + "epoch": 5.43, + "learning_rate": 4.4810465670164886e-08, + "loss": 0.1745, + "step": 15020, + "task_loss": 0.586732029914856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04693091948208174, + "compression/movement_sparsity/importance_threshold": -0.24612758090027198, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16656005382537842, + "epoch": 5.43, + "learning_rate": 4.425206578748275e-08, + "loss": 0.1607, + "step": 15030, + "task_loss": 0.4396723508834839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0469737676513548, + "compression/movement_sparsity/importance_threshold": -0.24554392550957704, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1422872245311737, + "epoch": 5.44, + "learning_rate": 4.369708823701024e-08, + "loss": 0.1739, + "step": 15040, + "task_loss": 0.4501122534275055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047016548028269156, + "compression/movement_sparsity/importance_threshold": -0.24496119355099788, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15386220812797546, + "epoch": 5.44, + "learning_rate": 4.31455350060056e-08, + "loss": 0.1732, + "step": 15050, + "task_loss": 0.2968456447124481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04705926066649619, + "compression/movement_sparsity/importance_threshold": -0.24437938429345074, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18353886902332306, + "epoch": 5.44, + "learning_rate": 4.259740806946477e-08, + "loss": 0.1725, + "step": 15060, + "task_loss": 0.4033396244049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047101905619707334, + "compression/movement_sparsity/importance_threshold": -0.24379849700585166, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1291903555393219, + "epoch": 5.45, + "learning_rate": 4.205270939011474e-08, + "loss": 0.1536, + "step": 15070, + "task_loss": 0.38127729296684265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047144482941573994, + "compression/movement_sparsity/importance_threshold": -0.24321853095711654, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1640148162841797, + "epoch": 5.45, + "learning_rate": 4.151144091840708e-08, + "loss": 0.1776, + "step": 15080, + "task_loss": 0.6958640813827515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04718699268576758, + "compression/movement_sparsity/importance_threshold": -0.24263948541616154, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17455732822418213, + "epoch": 5.45, + "learning_rate": 4.0973604592510094e-08, + "loss": 0.184, + "step": 15090, + "task_loss": 0.2959747612476349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047229434905959496, + "compression/movement_sparsity/importance_threshold": -0.2420613596519029, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13450655341148376, + "epoch": 5.46, + "learning_rate": 4.043920233830267e-08, + "loss": 0.1819, + "step": 15100, + "task_loss": 0.2962379455566406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04727180965582116, + "compression/movement_sparsity/importance_threshold": -0.24148415293325654, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15026503801345825, + "epoch": 5.46, + "learning_rate": 3.990823606936666e-08, + "loss": 0.1767, + "step": 15110, + "task_loss": 0.7710201740264893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04731411698902398, + "compression/movement_sparsity/importance_threshold": -0.24090786452913848, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1360689401626587, + "epoch": 5.46, + "learning_rate": 3.938070768698054e-08, + "loss": 0.1722, + "step": 15120, + "task_loss": 0.46412721276283264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047356356959239364, + "compression/movement_sparsity/importance_threshold": -0.24033249370846488, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1195104569196701, + "epoch": 5.47, + "learning_rate": 3.885661908011273e-08, + "loss": 0.1808, + "step": 15130, + "task_loss": 0.4792066216468811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04739852962013873, + "compression/movement_sparsity/importance_threshold": -0.23975803974015175, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1432269811630249, + "epoch": 5.47, + "learning_rate": 3.833597212541373e-08, + "loss": 0.1799, + "step": 15140, + "task_loss": 0.41720786690711975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047440635025393474, + "compression/movement_sparsity/importance_threshold": -0.23918450189311524, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1752137839794159, + "epoch": 5.48, + "learning_rate": 3.781876868721112e-08, + "loss": 0.1808, + "step": 15150, + "task_loss": 0.42168617248535156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04748267322867502, + "compression/movement_sparsity/importance_threshold": -0.23861187943627127, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15821881592273712, + "epoch": 5.48, + "learning_rate": 3.7305010617501245e-08, + "loss": 0.1747, + "step": 15160, + "task_loss": 0.25698161125183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04752464428365478, + "compression/movement_sparsity/importance_threshold": -0.23804017163853597, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.166069895029068, + "epoch": 5.48, + "learning_rate": 3.679469975594385e-08, + "loss": 0.1685, + "step": 15170, + "task_loss": 0.4625723361968994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04756654824400414, + "compression/movement_sparsity/importance_threshold": -0.2374693777688256, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17632624506950378, + "epoch": 5.49, + "learning_rate": 3.6287837929854795e-08, + "loss": 0.1778, + "step": 15180, + "task_loss": 0.36426687240600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047608385163394565, + "compression/movement_sparsity/importance_threshold": -0.23689949709605584, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16632002592086792, + "epoch": 5.49, + "learning_rate": 3.578442695419925e-08, + "loss": 0.1632, + "step": 15190, + "task_loss": 0.4585450291633606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0476501550954974, + "compression/movement_sparsity/importance_threshold": -0.2363305288891432, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20258350670337677, + "epoch": 5.49, + "learning_rate": 3.528446863158641e-08, + "loss": 0.1867, + "step": 15200, + "task_loss": 0.41815778613090515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047691858093984116, + "compression/movement_sparsity/importance_threshold": -0.2357624724170032, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19658106565475464, + "epoch": 5.5, + "learning_rate": 3.4787964752261536e-08, + "loss": 0.1876, + "step": 15210, + "task_loss": 0.5894261002540588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04773349421252606, + "compression/movement_sparsity/importance_threshold": -0.2351953269485526, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1922135055065155, + "epoch": 5.5, + "learning_rate": 3.4294917094100484e-08, + "loss": 0.1867, + "step": 15220, + "task_loss": 0.4709359407424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04777506350479469, + "compression/movement_sparsity/importance_threshold": -0.23462909175270685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14158789813518524, + "epoch": 5.5, + "learning_rate": 3.380532742260334e-08, + "loss": 0.1907, + "step": 15230, + "task_loss": 0.7388123869895935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04781656602446141, + "compression/movement_sparsity/importance_threshold": -0.23406376609838242, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21245448291301727, + "epoch": 5.51, + "learning_rate": 3.331919749088763e-08, + "loss": 0.1904, + "step": 15240, + "task_loss": 0.6762509346008301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047858001825197616, + "compression/movement_sparsity/importance_threshold": -0.23349934925449511, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16487757861614227, + "epoch": 5.51, + "learning_rate": 3.283652903968237e-08, + "loss": 0.1809, + "step": 15250, + "task_loss": 0.6355116367340088 + }, + { + "epoch": 5.51, + "eval_exact_match": 83.60454115421003, + "eval_f1": 90.05225316670094, + "step": 15250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04789937096067474, + "compression/movement_sparsity/importance_threshold": -0.23293584048996108, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13231196999549866, + "epoch": 5.51, + "learning_rate": 3.235732379732148e-08, + "loss": 0.1582, + "step": 15260, + "task_loss": 0.23886415362358093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04794067348456416, + "compression/movement_sparsity/importance_threshold": -0.23237323907369634, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17306077480316162, + "epoch": 5.52, + "learning_rate": 3.188158347973846e-08, + "loss": 0.1806, + "step": 15270, + "task_loss": 0.5274963974952698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047981909450537305, + "compression/movement_sparsity/importance_threshold": -0.23181154427461725, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1836722195148468, + "epoch": 5.52, + "learning_rate": 3.140930979045886e-08, + "loss": 0.1796, + "step": 15280, + "task_loss": 0.443234384059906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0480230789122656, + "compression/movement_sparsity/importance_threshold": -0.23125075536163953, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14876201748847961, + "epoch": 5.53, + "learning_rate": 3.094050442059559e-08, + "loss": 0.1717, + "step": 15290, + "task_loss": 0.7542725801467896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04806418192342043, + "compression/movement_sparsity/importance_threshold": -0.23069087160367951, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16534817218780518, + "epoch": 5.53, + "learning_rate": 3.047516904884206e-08, + "loss": 0.1689, + "step": 15300, + "task_loss": 0.4024174213409424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048105218537673224, + "compression/movement_sparsity/importance_threshold": -0.23013189226965303, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19905927777290344, + "epoch": 5.53, + "learning_rate": 3.0013305341466066e-08, + "loss": 0.1737, + "step": 15310, + "task_loss": 0.5906176567077637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048146188808695395, + "compression/movement_sparsity/importance_threshold": -0.22957381662847598, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.174982950091362, + "epoch": 5.54, + "learning_rate": 2.9554914952304665e-08, + "loss": 0.1761, + "step": 15320, + "task_loss": 0.45582154393196106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04818709279015833, + "compression/movement_sparsity/importance_threshold": -0.22901664394906496, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17691166698932648, + "epoch": 5.54, + "learning_rate": 2.9099999522757103e-08, + "loss": 0.1961, + "step": 15330, + "task_loss": 0.5763674974441528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04822793053573346, + "compression/movement_sparsity/importance_threshold": -0.22846037350033566, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19818875193595886, + "epoch": 5.54, + "learning_rate": 2.86485606817799e-08, + "loss": 0.1801, + "step": 15340, + "task_loss": 0.6044281721115112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04826870209909218, + "compression/movement_sparsity/importance_threshold": -0.22790500455120444, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13256880640983582, + "epoch": 5.55, + "learning_rate": 2.820060004588054e-08, + "loss": 0.1709, + "step": 15350, + "task_loss": 0.4946630001068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048309407533905925, + "compression/movement_sparsity/importance_threshold": -0.227350536370587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.177134171128273, + "epoch": 5.55, + "learning_rate": 2.7756119219111805e-08, + "loss": 0.1756, + "step": 15360, + "task_loss": 0.6153013706207275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04835004689384608, + "compression/movement_sparsity/importance_threshold": -0.2267969682273997, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15522846579551697, + "epoch": 5.55, + "learning_rate": 2.7315119793065998e-08, + "loss": 0.1707, + "step": 15370, + "task_loss": 0.27729225158691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04839062023258407, + "compression/movement_sparsity/importance_threshold": -0.22624429939055846, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16957354545593262, + "epoch": 5.56, + "learning_rate": 2.687760334686917e-08, + "loss": 0.1731, + "step": 15380, + "task_loss": 0.6803514361381531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048431127603791306, + "compression/movement_sparsity/importance_threshold": -0.2256925291289793, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14271289110183716, + "epoch": 5.56, + "learning_rate": 2.6443571447175795e-08, + "loss": 0.1669, + "step": 15390, + "task_loss": 0.5242533087730408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04847156906113919, + "compression/movement_sparsity/importance_threshold": -0.2251416567115785, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1309148520231247, + "epoch": 5.57, + "learning_rate": 2.6013025648162546e-08, + "loss": 0.1711, + "step": 15400, + "task_loss": 0.47524943947792053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04851194465829914, + "compression/movement_sparsity/importance_threshold": -0.22459168140727181, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14253975450992584, + "epoch": 5.57, + "learning_rate": 2.558596749152342e-08, + "loss": 0.1968, + "step": 15410, + "task_loss": 0.4356352686882019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04855225444894257, + "compression/movement_sparsity/importance_threshold": -0.22404260248497565, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17380890250205994, + "epoch": 5.57, + "learning_rate": 2.5162398506463957e-08, + "loss": 0.1748, + "step": 15420, + "task_loss": 0.48908424377441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04859249848674087, + "compression/movement_sparsity/importance_threshold": -0.2234944192136059, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1803099513053894, + "epoch": 5.58, + "learning_rate": 2.4742320209695245e-08, + "loss": 0.1778, + "step": 15430, + "task_loss": 0.35538819432258606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04863267682536546, + "compression/movement_sparsity/importance_threshold": -0.22294713086207874, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14477093517780304, + "epoch": 5.58, + "learning_rate": 2.4325734105429486e-08, + "loss": 0.1798, + "step": 15440, + "task_loss": 0.3041430115699768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048672789518487775, + "compression/movement_sparsity/importance_threshold": -0.22240073669930993, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16156978905200958, + "epoch": 5.58, + "learning_rate": 2.391264168537377e-08, + "loss": 0.19, + "step": 15450, + "task_loss": 0.4806953966617584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048712836619779196, + "compression/movement_sparsity/importance_threshold": -0.22185523599421586, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20278826355934143, + "epoch": 5.59, + "learning_rate": 2.350304442872497e-08, + "loss": 0.1881, + "step": 15460, + "task_loss": 0.7880595326423645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048752818182911134, + "compression/movement_sparsity/importance_threshold": -0.22131062801571266, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15064328908920288, + "epoch": 5.59, + "learning_rate": 2.309694380216487e-08, + "loss": 0.1644, + "step": 15470, + "task_loss": 0.37797996401786804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04879273426155503, + "compression/movement_sparsity/importance_threshold": -0.2207669120327158, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15130218863487244, + "epoch": 5.59, + "learning_rate": 2.2694341259854366e-08, + "loss": 0.1658, + "step": 15480, + "task_loss": 0.5249390602111816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048832584909382255, + "compression/movement_sparsity/importance_threshold": -0.22022408731414223, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15106838941574097, + "epoch": 5.6, + "learning_rate": 2.2295238243428384e-08, + "loss": 0.1679, + "step": 15490, + "task_loss": 0.43280619382858276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048872370180064253, + "compression/movement_sparsity/importance_threshold": -0.21968215312890738, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.161295086145401, + "epoch": 5.6, + "learning_rate": 2.1899636181990644e-08, + "loss": 0.1777, + "step": 15500, + "task_loss": 0.22472208738327026 + }, + { + "epoch": 5.6, + "eval_exact_match": 83.52885525070955, + "eval_f1": 89.91861288595577, + "step": 15500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04891209012727241, + "compression/movement_sparsity/importance_threshold": -0.2191411087459274, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15865418314933777, + "epoch": 5.61, + "learning_rate": 2.1507536492109123e-08, + "loss": 0.1714, + "step": 15510, + "task_loss": 0.3335818648338318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048951744804678156, + "compression/movement_sparsity/importance_threshold": -0.21860095343411856, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16958612203598022, + "epoch": 5.61, + "learning_rate": 2.1118940577810274e-08, + "loss": 0.1739, + "step": 15520, + "task_loss": 0.4664156436920166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04899133426595289, + "compression/movement_sparsity/importance_threshold": -0.21806168646239676, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1692367047071457, + "epoch": 5.61, + "learning_rate": 2.0733849830574135e-08, + "loss": 0.1765, + "step": 15530, + "task_loss": 0.6259729862213135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04903085856476802, + "compression/movement_sparsity/importance_threshold": -0.21752330709967815, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1735900342464447, + "epoch": 5.62, + "learning_rate": 2.0352265629329678e-08, + "loss": 0.192, + "step": 15540, + "task_loss": 0.343353807926178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049070317754794965, + "compression/movement_sparsity/importance_threshold": -0.21698581461487876, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13655024766921997, + "epoch": 5.62, + "learning_rate": 1.997418934044959e-08, + "loss": 0.1741, + "step": 15550, + "task_loss": 0.16692593693733215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049109711889705136, + "compression/movement_sparsity/importance_threshold": -0.2164492082769146, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14855730533599854, + "epoch": 5.62, + "learning_rate": 1.95996223177457e-08, + "loss": 0.1727, + "step": 15560, + "task_loss": 0.46668440103530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04914904102316992, + "compression/movement_sparsity/importance_threshold": -0.21591348735470217, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14958634972572327, + "epoch": 5.63, + "learning_rate": 1.9228565902463356e-08, + "loss": 0.1707, + "step": 15570, + "task_loss": 0.32905149459838867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04918830520886077, + "compression/movement_sparsity/importance_threshold": -0.2153786511171567, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15305137634277344, + "epoch": 5.63, + "learning_rate": 1.8861021423277722e-08, + "loss": 0.1729, + "step": 15580, + "task_loss": 0.45828777551651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049227504500449056, + "compression/movement_sparsity/importance_threshold": -0.21484469883319512, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1438327133655548, + "epoch": 5.63, + "learning_rate": 1.8496990196288143e-08, + "loss": 0.1723, + "step": 15590, + "task_loss": 0.28205084800720215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04926663895160622, + "compression/movement_sparsity/importance_threshold": -0.2143116297717329, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19014272093772888, + "epoch": 5.64, + "learning_rate": 1.8136473525013907e-08, + "loss": 0.1907, + "step": 15600, + "task_loss": 0.6441313624382019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04930570861600365, + "compression/movement_sparsity/importance_threshold": -0.2137794432016863, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1672857701778412, + "epoch": 5.64, + "learning_rate": 1.7779472700389265e-08, + "loss": 0.1924, + "step": 15610, + "task_loss": 0.4105015695095062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04934471354731277, + "compression/movement_sparsity/importance_threshold": -0.21324813839197143, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1488228738307953, + "epoch": 5.65, + "learning_rate": 1.742598900075909e-08, + "loss": 0.1778, + "step": 15620, + "task_loss": 0.41871628165245056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04938365379920498, + "compression/movement_sparsity/importance_threshold": -0.21271771461150446, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19621488451957703, + "epoch": 5.65, + "learning_rate": 1.7076023691874e-08, + "loss": 0.1715, + "step": 15630, + "task_loss": 0.36311233043670654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04942252942535171, + "compression/movement_sparsity/importance_threshold": -0.2121881711292012, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14452539384365082, + "epoch": 5.65, + "learning_rate": 1.6729578026886347e-08, + "loss": 0.1759, + "step": 15640, + "task_loss": 0.3132791519165039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049461340479424354, + "compression/movement_sparsity/importance_threshold": -0.21165950721397775, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20849350094795227, + "epoch": 5.66, + "learning_rate": 1.6386653246344916e-08, + "loss": 0.1801, + "step": 15650, + "task_loss": 0.4664916396141052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04950008701509432, + "compression/movement_sparsity/importance_threshold": -0.2111317221347504, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14356830716133118, + "epoch": 5.66, + "learning_rate": 1.6047250578191342e-08, + "loss": 0.1875, + "step": 15660, + "task_loss": 0.33819547295570374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049538769086033024, + "compression/movement_sparsity/importance_threshold": -0.21060481516043505, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17026101052761078, + "epoch": 5.66, + "learning_rate": 1.571137123775501e-08, + "loss": 0.198, + "step": 15670, + "task_loss": 0.28901436924934387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04957738674591188, + "compression/movement_sparsity/importance_threshold": -0.21007878555994786, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18996986746788025, + "epoch": 5.67, + "learning_rate": 1.5379016427749193e-08, + "loss": 0.1702, + "step": 15680, + "task_loss": 0.3736341595649719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0496159400484023, + "compression/movement_sparsity/importance_threshold": -0.20955363260220472, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16425611078739166, + "epoch": 5.67, + "learning_rate": 1.5050187338266574e-08, + "loss": 0.1707, + "step": 15690, + "task_loss": 0.32322365045547485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04965442904717568, + "compression/movement_sparsity/importance_threshold": -0.2090293555561219, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17800205945968628, + "epoch": 5.67, + "learning_rate": 1.4724885146774834e-08, + "loss": 0.1811, + "step": 15700, + "task_loss": 0.4337901771068573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04969285379590344, + "compression/movement_sparsity/importance_threshold": -0.20850595369061542, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15589317679405212, + "epoch": 5.68, + "learning_rate": 1.4403111018112645e-08, + "loss": 0.1792, + "step": 15710, + "task_loss": 0.357605516910553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04973121434825701, + "compression/movement_sparsity/importance_threshold": -0.2079834262746012, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16989806294441223, + "epoch": 5.68, + "learning_rate": 1.408486610448567e-08, + "loss": 0.1705, + "step": 15720, + "task_loss": 0.5997334718704224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04976951075790777, + "compression/movement_sparsity/importance_threshold": -0.2074617725769956, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15032757818698883, + "epoch": 5.68, + "learning_rate": 1.3770151545461683e-08, + "loss": 0.1719, + "step": 15730, + "task_loss": 0.4771418273448944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04980774307852714, + "compression/movement_sparsity/importance_threshold": -0.20694099186671433, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15643621981143951, + "epoch": 5.69, + "learning_rate": 1.3458968467967457e-08, + "loss": 0.1763, + "step": 15740, + "task_loss": 0.41822129487991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049845911363786535, + "compression/movement_sparsity/importance_threshold": -0.20642108341267373, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12617532908916473, + "epoch": 5.69, + "learning_rate": 1.3151317986283994e-08, + "loss": 0.1741, + "step": 15750, + "task_loss": 0.5501689910888672 + }, + { + "epoch": 5.69, + "eval_exact_match": 83.65184484389782, + "eval_f1": 89.99673268802798, + "step": 15750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049884015667357354, + "compression/movement_sparsity/importance_threshold": -0.20590204648378985, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14693979918956757, + "epoch": 5.7, + "learning_rate": 1.2847201202043079e-08, + "loss": 0.1932, + "step": 15760, + "task_loss": 0.44467893242836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049922056042911035, + "compression/movement_sparsity/importance_threshold": -0.20538388034897848, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15097220242023468, + "epoch": 5.7, + "learning_rate": 1.254661920422273e-08, + "loss": 0.1731, + "step": 15770, + "task_loss": 0.2182091921567917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04996003254411896, + "compression/movement_sparsity/importance_threshold": -0.20486658427715598, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17072612047195435, + "epoch": 5.7, + "learning_rate": 1.2249573069143981e-08, + "loss": 0.1689, + "step": 15780, + "task_loss": 0.35801267623901367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999794522465255, + "compression/movement_sparsity/importance_threshold": -0.2043501575372384, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13755616545677185, + "epoch": 5.71, + "learning_rate": 1.1956063860466436e-08, + "loss": 0.173, + "step": 15790, + "task_loss": 0.3809490203857422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05003579413818322, + "compression/movement_sparsity/importance_threshold": -0.20383459939814164, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17082442343235016, + "epoch": 5.71, + "learning_rate": 1.1666092629184831e-08, + "loss": 0.1782, + "step": 15800, + "task_loss": 0.571792721748352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05007357933838238, + "compression/movement_sparsity/importance_threshold": -0.20331990912878184, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18530184030532837, + "epoch": 5.71, + "learning_rate": 1.1379660413625037e-08, + "loss": 0.183, + "step": 15810, + "task_loss": 0.5109858512878418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05011130087892143, + "compression/movement_sparsity/importance_threshold": -0.20280608599807504, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14845629036426544, + "epoch": 5.72, + "learning_rate": 1.1096768239440612e-08, + "loss": 0.1628, + "step": 15820, + "task_loss": 0.2701829671859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0501489588134718, + "compression/movement_sparsity/importance_threshold": -0.20229312927493737, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16749542951583862, + "epoch": 5.72, + "learning_rate": 1.081741711960893e-08, + "loss": 0.1836, + "step": 15830, + "task_loss": 0.4017007350921631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05018655319570488, + "compression/movement_sparsity/importance_threshold": -0.20178103822828497, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1643616259098053, + "epoch": 5.72, + "learning_rate": 1.0541608054427386e-08, + "loss": 0.1929, + "step": 15840, + "task_loss": 0.4987179636955261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05022408407929209, + "compression/movement_sparsity/importance_threshold": -0.20126981212703376, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16851231455802917, + "epoch": 5.73, + "learning_rate": 1.0269342031510531e-08, + "loss": 0.1722, + "step": 15850, + "task_loss": 0.3005487620830536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05026155151790483, + "compression/movement_sparsity/importance_threshold": -0.2007594502400999, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17689800262451172, + "epoch": 5.73, + "learning_rate": 1.000062002578539e-08, + "loss": 0.1869, + "step": 15860, + "task_loss": 0.1859908550977707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05029895556521454, + "compression/movement_sparsity/importance_threshold": -0.20024995183639926, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20160090923309326, + "epoch": 5.74, + "learning_rate": 9.73544299948903e-09, + "loss": 0.1833, + "step": 15870, + "task_loss": 0.7696638107299805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05033629627489259, + "compression/movement_sparsity/importance_threshold": -0.19974131618484825, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17273235321044922, + "epoch": 5.74, + "learning_rate": 9.473811902164564e-09, + "loss": 0.1757, + "step": 15880, + "task_loss": 0.5132439136505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05037357370061043, + "compression/movement_sparsity/importance_threshold": -0.19923354255436265, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1815074384212494, + "epoch": 5.74, + "learning_rate": 9.215727670657813e-09, + "loss": 0.1809, + "step": 15890, + "task_loss": 0.47850722074508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05041078789603944, + "compression/movement_sparsity/importance_threshold": -0.1987266302138586, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15570800006389618, + "epoch": 5.75, + "learning_rate": 8.961191229114317e-09, + "loss": 0.1657, + "step": 15900, + "task_loss": 0.38322216272354126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.050447938914851054, + "compression/movement_sparsity/importance_threshold": -0.19822057843225227, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14382407069206238, + "epoch": 5.75, + "learning_rate": 8.710203488975221e-09, + "loss": 0.1711, + "step": 15910, + "task_loss": 0.3041207194328308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05048502681071666, + "compression/movement_sparsity/importance_threshold": -0.19771538647845965, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10968972742557526, + "epoch": 5.75, + "learning_rate": 8.462765348974943e-09, + "loss": 0.1669, + "step": 15920, + "task_loss": 0.47378888726234436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.050522051637307684, + "compression/movement_sparsity/importance_threshold": -0.1972110536213968, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17376708984375, + "epoch": 5.76, + "learning_rate": 8.218877695137294e-09, + "loss": 0.1801, + "step": 15930, + "task_loss": 0.42871758341789246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05055901344829554, + "compression/movement_sparsity/importance_threshold": -0.19670757912997971, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17503592371940613, + "epoch": 5.76, + "learning_rate": 7.97854140077281e-09, + "loss": 0.1781, + "step": 15940, + "task_loss": 0.3418278694152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05059591229735162, + "compression/movement_sparsity/importance_threshold": -0.19620496227312456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19280436635017395, + "epoch": 5.76, + "learning_rate": 7.741757326475195e-09, + "loss": 0.1785, + "step": 15950, + "task_loss": 0.5458291172981262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05063274823814735, + "compression/movement_sparsity/importance_threshold": -0.19570320231974747, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1474490612745285, + "epoch": 5.77, + "learning_rate": 7.508526320118114e-09, + "loss": 0.1703, + "step": 15960, + "task_loss": 0.31817954778671265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.050669521324354136, + "compression/movement_sparsity/importance_threshold": -0.19520229853876425, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22078892588615417, + "epoch": 5.77, + "learning_rate": 7.2788492168529556e-09, + "loss": 0.1843, + "step": 15970, + "task_loss": 0.45727652311325073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05070623160964339, + "compression/movement_sparsity/importance_threshold": -0.19470225019909138, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14431482553482056, + "epoch": 5.78, + "learning_rate": 7.052726839105072e-09, + "loss": 0.1673, + "step": 15980, + "task_loss": 0.7761343717575073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05074287914768652, + "compression/movement_sparsity/importance_threshold": -0.19420305656964454, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1601599156856537, + "epoch": 5.78, + "learning_rate": 6.830159996570883e-09, + "loss": 0.1785, + "step": 15990, + "task_loss": 0.41845470666885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05077946399215494, + "compression/movement_sparsity/importance_threshold": -0.19370471691933988, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18368986248970032, + "epoch": 5.78, + "learning_rate": 6.611149486215772e-09, + "loss": 0.1674, + "step": 16000, + "task_loss": 0.35329797863960266 + }, + { + "epoch": 5.78, + "eval_exact_match": 83.65184484389782, + "eval_f1": 90.04175695740012, + "step": 16000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05081598619672006, + "compression/movement_sparsity/importance_threshold": -0.19320723051709365, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13221392035484314, + "epoch": 5.79, + "learning_rate": 6.395696092269975e-09, + "loss": 0.1698, + "step": 16010, + "task_loss": 0.43325942754745483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05085244581505328, + "compression/movement_sparsity/importance_threshold": -0.19271059663182188, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16434437036514282, + "epoch": 5.79, + "learning_rate": 6.183800586226917e-09, + "loss": 0.1674, + "step": 16020, + "task_loss": 0.47905054688453674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05088884290082603, + "compression/movement_sparsity/importance_threshold": -0.19221481453244038, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12508782744407654, + "epoch": 5.79, + "learning_rate": 5.975463726839769e-09, + "loss": 0.1762, + "step": 16030, + "task_loss": 0.16149437427520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05092517750770971, + "compression/movement_sparsity/importance_threshold": -0.1917198834878654, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13816924393177032, + "epoch": 5.8, + "learning_rate": 5.7706862601188956e-09, + "loss": 0.1769, + "step": 16040, + "task_loss": 0.4390493631362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.050961449689375715, + "compression/movement_sparsity/importance_threshold": -0.19122580276701318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15949416160583496, + "epoch": 5.8, + "learning_rate": 5.569468919329412e-09, + "loss": 0.1825, + "step": 16050, + "task_loss": 0.376995712518692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05099765949949549, + "compression/movement_sparsity/importance_threshold": -0.19073257163879942, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17084449529647827, + "epoch": 5.8, + "learning_rate": 5.371812424988298e-09, + "loss": 0.1777, + "step": 16060, + "task_loss": 0.5042399168014526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05103380699174041, + "compression/movement_sparsity/importance_threshold": -0.1902401893721405, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14627447724342346, + "epoch": 5.81, + "learning_rate": 5.177717484861843e-09, + "loss": 0.1862, + "step": 16070, + "task_loss": 0.8221181631088257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051069892219781914, + "compression/movement_sparsity/importance_threshold": -0.1897486552359523, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1746702343225479, + "epoch": 5.81, + "learning_rate": 4.987184793962984e-09, + "loss": 0.1725, + "step": 16080, + "task_loss": 0.7324734926223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05110591523729141, + "compression/movement_sparsity/importance_threshold": -0.18925796849915089, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1688476949930191, + "epoch": 5.81, + "learning_rate": 4.800215034549527e-09, + "loss": 0.183, + "step": 16090, + "task_loss": 0.46025651693344116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051141876097940284, + "compression/movement_sparsity/importance_threshold": -0.1887681284306526, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1825210154056549, + "epoch": 5.82, + "learning_rate": 4.616808876120592e-09, + "loss": 0.1837, + "step": 16100, + "task_loss": 0.3381309509277344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051177774855399975, + "compression/movement_sparsity/importance_threshold": -0.18827913429937304, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17920146882534027, + "epoch": 5.82, + "learning_rate": 4.4369669754150686e-09, + "loss": 0.188, + "step": 16110, + "task_loss": 0.5360010862350464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05121361156334188, + "compression/movement_sparsity/importance_threshold": -0.18779098537422856, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16270099580287933, + "epoch": 5.83, + "learning_rate": 4.260689976408938e-09, + "loss": 0.1853, + "step": 16120, + "task_loss": 0.35809409618377686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0512493862754374, + "compression/movement_sparsity/importance_threshold": -0.1873036809241353, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15442490577697754, + "epoch": 5.83, + "learning_rate": 4.087978510313173e-09, + "loss": 0.1841, + "step": 16130, + "task_loss": 0.3518342971801758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05128509904535796, + "compression/movement_sparsity/importance_threshold": -0.18681722021800928, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13016200065612793, + "epoch": 5.83, + "learning_rate": 3.91883319557107e-09, + "loss": 0.1738, + "step": 16140, + "task_loss": 0.3382733464241028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05132074992677497, + "compression/movement_sparsity/importance_threshold": -0.18633160252476644, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2215452790260315, + "epoch": 5.84, + "learning_rate": 3.753254637856362e-09, + "loss": 0.1911, + "step": 16150, + "task_loss": 0.3834210932254791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05135633897335984, + "compression/movement_sparsity/importance_threshold": -0.1858468271133228, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1495056003332138, + "epoch": 5.84, + "learning_rate": 3.5912434300711113e-09, + "loss": 0.1791, + "step": 16160, + "task_loss": 0.2563254237174988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05139186623878396, + "compression/movement_sparsity/importance_threshold": -0.1853628932525948, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1634816825389862, + "epoch": 5.84, + "learning_rate": 3.4328001523432625e-09, + "loss": 0.1836, + "step": 16170, + "task_loss": 0.49831482768058777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05142733177671878, + "compression/movement_sparsity/importance_threshold": -0.18487980021149808, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18057052791118622, + "epoch": 5.85, + "learning_rate": 3.277925372024981e-09, + "loss": 0.1897, + "step": 16180, + "task_loss": 0.4678457975387573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051462735640835686, + "compression/movement_sparsity/importance_threshold": -0.18439754725894886, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15450020134449005, + "epoch": 5.85, + "learning_rate": 3.1266196436902092e-09, + "loss": 0.1728, + "step": 16190, + "task_loss": 0.3961532711982727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051498077884806086, + "compression/movement_sparsity/importance_threshold": -0.18391613366386328, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1460452675819397, + "epoch": 5.85, + "learning_rate": 2.9788835091328902e-09, + "loss": 0.1675, + "step": 16200, + "task_loss": 0.3941865563392639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051533358562301394, + "compression/movement_sparsity/importance_threshold": -0.18343555869515726, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18922826647758484, + "epoch": 5.86, + "learning_rate": 2.834717497364969e-09, + "loss": 0.1761, + "step": 16210, + "task_loss": 0.6385120153427124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05156857772699304, + "compression/movement_sparsity/importance_threshold": -0.18295582162174695, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16979092359542847, + "epoch": 5.86, + "learning_rate": 2.6941221246147283e-09, + "loss": 0.1839, + "step": 16220, + "task_loss": 0.4651219844818115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0516037354325524, + "compression/movement_sparsity/importance_threshold": -0.1824769217125486, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21407416462898254, + "epoch": 5.87, + "learning_rate": 2.557097894324567e-09, + "loss": 0.1886, + "step": 16230, + "task_loss": 0.60257887840271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0516388317326509, + "compression/movement_sparsity/importance_threshold": -0.181998858236478, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19913232326507568, + "epoch": 5.87, + "learning_rate": 2.4236452971493348e-09, + "loss": 0.1795, + "step": 16240, + "task_loss": 0.44450339674949646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05167386668095997, + "compression/movement_sparsity/importance_threshold": -0.1815216304624513, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17576679587364197, + "epoch": 5.87, + "learning_rate": 2.2937648109547793e-09, + "loss": 0.1725, + "step": 16250, + "task_loss": 0.3722856938838959 + }, + { + "epoch": 5.87, + "eval_exact_match": 83.62346263008514, + "eval_f1": 90.07747022728749, + "step": 16250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05170884033115099, + "compression/movement_sparsity/importance_threshold": -0.18104523765938463, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16510339081287384, + "epoch": 5.88, + "learning_rate": 2.167456900815545e-09, + "loss": 0.1934, + "step": 16260, + "task_loss": 0.5060644149780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051743752736895396, + "compression/movement_sparsity/importance_threshold": -0.18056967909619392, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15950855612754822, + "epoch": 5.88, + "learning_rate": 2.0447220190136225e-09, + "loss": 0.1809, + "step": 16270, + "task_loss": 0.5934314727783203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05177860395186458, + "compression/movement_sparsity/importance_threshold": -0.18009495404179532, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18966922163963318, + "epoch": 5.88, + "learning_rate": 1.9255606050369024e-09, + "loss": 0.1852, + "step": 16280, + "task_loss": 0.4227147698402405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05181339402972995, + "compression/movement_sparsity/importance_threshold": -0.17962106176510495, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18544501066207886, + "epoch": 5.89, + "learning_rate": 1.8099730855773986e-09, + "loss": 0.1664, + "step": 16290, + "task_loss": 0.34008803963661194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05184812302416294, + "compression/movement_sparsity/importance_threshold": -0.17914800153503885, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16752862930297852, + "epoch": 5.89, + "learning_rate": 1.6979598745294754e-09, + "loss": 0.1778, + "step": 16300, + "task_loss": 0.417269229888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051882790988834936, + "compression/movement_sparsity/importance_threshold": -0.17867577262051304, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15857253968715668, + "epoch": 5.89, + "learning_rate": 1.5895213729889555e-09, + "loss": 0.1804, + "step": 16310, + "task_loss": 0.773322582244873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051917397977417366, + "compression/movement_sparsity/importance_threshold": -0.17820437429044367, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1497032344341278, + "epoch": 5.9, + "learning_rate": 1.484657969251346e-09, + "loss": 0.1761, + "step": 16320, + "task_loss": 0.3770208954811096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.051951944043581624, + "compression/movement_sparsity/importance_threshold": -0.17773380581374676, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11823238432407379, + "epoch": 5.9, + "learning_rate": 1.3833700388103943e-09, + "loss": 0.1767, + "step": 16330, + "task_loss": 0.2198001593351364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05198642924099913, + "compression/movement_sparsity/importance_threshold": -0.17726406645933845, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16819697618484497, + "epoch": 5.91, + "learning_rate": 1.285657944356977e-09, + "loss": 0.1708, + "step": 16340, + "task_loss": 0.35815563797950745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052020853623341315, + "compression/movement_sparsity/importance_threshold": -0.17679515549613445, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15140169858932495, + "epoch": 5.91, + "learning_rate": 1.1915220357772149e-09, + "loss": 0.1769, + "step": 16350, + "task_loss": 0.5908142328262329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05205521724427954, + "compression/movement_sparsity/importance_threshold": -0.17632707219305144, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14047864079475403, + "epoch": 5.91, + "learning_rate": 1.1009626501523595e-09, + "loss": 0.1804, + "step": 16360, + "task_loss": 0.31691184639930725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05208952015748527, + "compression/movement_sparsity/importance_threshold": -0.175859815819005, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1407729983329773, + "epoch": 5.92, + "learning_rate": 1.0139801117562408e-09, + "loss": 0.1801, + "step": 16370, + "task_loss": 0.5783429145812988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05212376241662988, + "compression/movement_sparsity/importance_threshold": -0.1753933856429113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1462913155555725, + "epoch": 5.92, + "learning_rate": 9.30574732055156e-10, + "loss": 0.179, + "step": 16380, + "task_loss": 0.5032361149787903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052157944075384786, + "compression/movement_sparsity/importance_threshold": -0.17492778093368655, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16070584952831268, + "epoch": 5.92, + "learning_rate": 8.507468097062043e-10, + "loss": 0.1658, + "step": 16390, + "task_loss": 0.27588170766830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05219206518742141, + "compression/movement_sparsity/importance_threshold": -0.17446300096024658, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18269900977611542, + "epoch": 5.93, + "learning_rate": 7.744966305563982e-10, + "loss": 0.1777, + "step": 16400, + "task_loss": 0.3831246495246887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05222612580641116, + "compression/movement_sparsity/importance_threshold": -0.17399904499150776, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1743420958518982, + "epoch": 5.93, + "learning_rate": 7.018244676415541e-10, + "loss": 0.1687, + "step": 16410, + "task_loss": 0.460012823343277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05226012598602544, + "compression/movement_sparsity/importance_threshold": -0.17353591229638587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15871578454971313, + "epoch": 5.93, + "learning_rate": 6.327305811852923e-10, + "loss": 0.1759, + "step": 16420, + "task_loss": 0.3666446805000305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05229406577993566, + "compression/movement_sparsity/importance_threshold": -0.17307360214379708, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12524762749671936, + "epoch": 5.94, + "learning_rate": 5.672152185983714e-10, + "loss": 0.1674, + "step": 16430, + "task_loss": 0.5719473361968994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05232794524181323, + "compression/movement_sparsity/importance_threshold": -0.17261211380265762, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13641473650932312, + "epoch": 5.94, + "learning_rate": 5.052786144775778e-10, + "loss": 0.1772, + "step": 16440, + "task_loss": 0.491558313369751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052361764425329575, + "compression/movement_sparsity/importance_threshold": -0.1721514465418833, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13365453481674194, + "epoch": 5.95, + "learning_rate": 4.469209906048377e-10, + "loss": 0.1727, + "step": 16450, + "task_loss": 0.2934437394142151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05239552338415609, + "compression/movement_sparsity/importance_threshold": -0.17169159963039038, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14834952354431152, + "epoch": 5.95, + "learning_rate": 3.921425559463287e-10, + "loss": 0.1874, + "step": 16460, + "task_loss": 0.4384889304637909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05242922217196419, + "compression/movement_sparsity/importance_threshold": -0.17123257233709488, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15647485852241516, + "epoch": 5.95, + "learning_rate": 3.4094350665236913e-10, + "loss": 0.1962, + "step": 16470, + "task_loss": 0.5159170031547546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05246286084242529, + "compression/movement_sparsity/importance_threshold": -0.17077436393091272, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14214615523815155, + "epoch": 5.96, + "learning_rate": 2.933240260558634e-10, + "loss": 0.1902, + "step": 16480, + "task_loss": 0.31043171882629395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05249643944921079, + "compression/movement_sparsity/importance_threshold": -0.17031697368076015, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15785396099090576, + "epoch": 5.96, + "learning_rate": 2.4928428467207997e-10, + "loss": 0.1898, + "step": 16490, + "task_loss": 0.2698136568069458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052529958045992114, + "compression/movement_sparsity/importance_threshold": -0.1698604008555531, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1571805626153946, + "epoch": 5.96, + "learning_rate": 2.0882444019809653e-10, + "loss": 0.179, + "step": 16500, + "task_loss": 0.29651594161987305 + }, + { + "epoch": 5.96, + "eval_exact_match": 83.73699148533585, + "eval_f1": 90.03229638831709, + "step": 16500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05256341668644067, + "compression/movement_sparsity/importance_threshold": -0.1694046447242078, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1439880132675171, + "epoch": 5.97, + "learning_rate": 1.719446375121336e-10, + "loss": 0.1733, + "step": 16510, + "task_loss": 0.4038183093070984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05259681542422786, + "compression/movement_sparsity/importance_threshold": -0.16894970455564018, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14894336462020874, + "epoch": 5.97, + "learning_rate": 1.3864500867311057e-10, + "loss": 0.1919, + "step": 16520, + "task_loss": 0.32719749212265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0526301543130251, + "compression/movement_sparsity/importance_threshold": -0.16849557961876638, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13948574662208557, + "epoch": 5.97, + "learning_rate": 1.089256729197574e-10, + "loss": 0.1815, + "step": 16530, + "task_loss": 0.6157231330871582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0526634334065038, + "compression/movement_sparsity/importance_threshold": -0.16804226918250242, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15811459720134735, + "epoch": 5.98, + "learning_rate": 8.278673667094783e-11, + "loss": 0.169, + "step": 16540, + "task_loss": 0.3574429750442505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05269665275833536, + "compression/movement_sparsity/importance_threshold": -0.16758977251576435, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17332813143730164, + "epoch": 5.98, + "learning_rate": 6.022829352458902e-11, + "loss": 0.1862, + "step": 16550, + "task_loss": 0.6019191741943359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05272981242219122, + "compression/movement_sparsity/importance_threshold": -0.16713808888746828, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15140379965305328, + "epoch": 5.98, + "learning_rate": 4.125042425784375e-11, + "loss": 0.1758, + "step": 16560, + "task_loss": 0.3916362524032593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05276291245174276, + "compression/movement_sparsity/importance_threshold": -0.16668721756653027, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18484221398830414, + "epoch": 5.99, + "learning_rate": 2.5853196826353118e-11, + "loss": 0.1708, + "step": 16570, + "task_loss": 0.5214443206787109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05279595290066141, + "compression/movement_sparsity/importance_threshold": -0.16623715782186643, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14860549569129944, + "epoch": 5.99, + "learning_rate": 1.403666636445866e-11, + "loss": 0.1672, + "step": 16580, + "task_loss": 0.4291364848613739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052828933822618565, + "compression/movement_sparsity/importance_threshold": -0.1657879089223927, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1612284928560257, + "epoch": 6.0, + "learning_rate": 5.8008751845362024e-12, + "loss": 0.1767, + "step": 16590, + "task_loss": 0.4751649498939514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052861855271285646, + "compression/movement_sparsity/importance_threshold": -0.16533947013702532, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15080714225769043, + "epoch": 6.0, + "learning_rate": 1.1458527773289262e-12, + "loss": 0.1735, + "step": 16600, + "task_loss": 0.34060317277908325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.052894717300334064, + "compression/movement_sparsity/importance_threshold": -0.16489184073468022, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13875526189804077, + "epoch": 6.0, + "learning_rate": 1.9999999283841884e-06, + "loss": 0.1644, + "step": 16610, + "task_loss": 0.36576491594314575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05292751996343522, + "compression/movement_sparsity/importance_threshold": -0.16444501998427363, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16185107827186584, + "epoch": 6.01, + "learning_rate": 1.9999974218318666e-06, + "loss": 0.1683, + "step": 16620, + "task_loss": 0.30015283823013306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05296026331426053, + "compression/movement_sparsity/importance_threshold": -0.1639990071547215, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13772368431091309, + "epoch": 6.01, + "learning_rate": 1.999991334499232e-06, + "loss": 0.1636, + "step": 16630, + "task_loss": 0.5743692517280579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05299294740648142, + "compression/movement_sparsity/importance_threshold": -0.1635538015149396, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17779430747032166, + "epoch": 6.01, + "learning_rate": 1.9999816664080824e-06, + "loss": 0.168, + "step": 16640, + "task_loss": 0.47371160984039307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053025572293769266, + "compression/movement_sparsity/importance_threshold": -0.16310940233384463, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12829241156578064, + "epoch": 6.02, + "learning_rate": 1.999968417593037e-06, + "loss": 0.1725, + "step": 16650, + "task_loss": 0.22890448570251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05305813802979552, + "compression/movement_sparsity/importance_threshold": -0.1626658088803521, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14386427402496338, + "epoch": 6.02, + "learning_rate": 1.9999515881015373e-06, + "loss": 0.1647, + "step": 16660, + "task_loss": 0.41270148754119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05309064466823156, + "compression/movement_sparsity/importance_threshold": -0.16222302042337844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13118350505828857, + "epoch": 6.02, + "learning_rate": 1.999931177993846e-06, + "loss": 0.1577, + "step": 16670, + "task_loss": 0.9137722253799438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05312309226274881, + "compression/movement_sparsity/importance_threshold": -0.16178103623183948, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14832520484924316, + "epoch": 6.03, + "learning_rate": 1.9999071873430475e-06, + "loss": 0.157, + "step": 16680, + "task_loss": 0.44995182752609253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05315548086701868, + "compression/movement_sparsity/importance_threshold": -0.16133985557465147, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12737812101840973, + "epoch": 6.03, + "learning_rate": 1.9998796162350473e-06, + "loss": 0.1638, + "step": 16690, + "task_loss": 0.2989009916782379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053187810534712575, + "compression/movement_sparsity/importance_threshold": -0.16089947772073032, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14384889602661133, + "epoch": 6.04, + "learning_rate": 1.999848464768571e-06, + "loss": 0.1604, + "step": 16700, + "task_loss": 0.636667013168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053220081319501907, + "compression/movement_sparsity/importance_threshold": -0.1604599019389923, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14905652403831482, + "epoch": 6.04, + "learning_rate": 1.999813733055167e-06, + "loss": 0.1519, + "step": 16710, + "task_loss": 0.5293204188346863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0532522932750581, + "compression/movement_sparsity/importance_threshold": -0.16002112749835318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14722052216529846, + "epoch": 6.04, + "learning_rate": 1.9997754212192007e-06, + "loss": 0.1677, + "step": 16720, + "task_loss": 0.4709875285625458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05328444645505254, + "compression/movement_sparsity/importance_threshold": -0.15958315366772924, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16393886506557465, + "epoch": 6.05, + "learning_rate": 1.9997335293978595e-06, + "loss": 0.166, + "step": 16730, + "task_loss": 0.4563778340816498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05331654091315667, + "compression/movement_sparsity/importance_threshold": -0.1591459797160365, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1607019305229187, + "epoch": 6.05, + "learning_rate": 1.999688057741149e-06, + "loss": 0.1704, + "step": 16740, + "task_loss": 0.49112173914909363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05334857670304187, + "compression/movement_sparsity/importance_threshold": -0.1587096049121911, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1364935040473938, + "epoch": 6.05, + "learning_rate": 1.999639006411894e-06, + "loss": 0.1494, + "step": 16750, + "task_loss": 0.3835960328578949 + }, + { + "epoch": 6.05, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.92193376526602, + "step": 16750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053380553878379575, + "compression/movement_sparsity/importance_threshold": -0.15827402852510886, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1750296950340271, + "epoch": 6.06, + "learning_rate": 1.9995863755857365e-06, + "loss": 0.174, + "step": 16760, + "task_loss": 0.39857983589172363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05341247249284118, + "compression/movement_sparsity/importance_threshold": -0.157839249823706, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15796607732772827, + "epoch": 6.06, + "learning_rate": 1.9995301654511367e-06, + "loss": 0.1571, + "step": 16770, + "task_loss": 0.6707438826560974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05344433260009809, + "compression/movement_sparsity/importance_threshold": -0.1574052680768988, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14516857266426086, + "epoch": 6.06, + "learning_rate": 1.999470376209371e-06, + "loss": 0.1627, + "step": 16780, + "task_loss": 0.5306075811386108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053476134253821736, + "compression/movement_sparsity/importance_threshold": -0.15697208255360307, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18241889774799347, + "epoch": 6.07, + "learning_rate": 1.9994070080745324e-06, + "loss": 0.1769, + "step": 16790, + "task_loss": 0.6663118600845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05350787750768351, + "compression/movement_sparsity/importance_threshold": -0.1565396925227348, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1332705169916153, + "epoch": 6.07, + "learning_rate": 1.9993400612735286e-06, + "loss": 0.1623, + "step": 16800, + "task_loss": 0.518804132938385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053539562415354826, + "compression/movement_sparsity/importance_threshold": -0.1561080972532104, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13718751072883606, + "epoch": 6.08, + "learning_rate": 1.999269536046082e-06, + "loss": 0.1616, + "step": 16810, + "task_loss": 0.29684680700302124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0535711890305071, + "compression/movement_sparsity/importance_threshold": -0.15567729601394564, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16599276661872864, + "epoch": 6.08, + "learning_rate": 1.9991954326447287e-06, + "loss": 0.1692, + "step": 16820, + "task_loss": 0.32731759548187256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05360275740681175, + "compression/movement_sparsity/importance_threshold": -0.15524728807385668, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16123579442501068, + "epoch": 6.08, + "learning_rate": 1.9991177513348175e-06, + "loss": 0.1853, + "step": 16830, + "task_loss": 0.48672449588775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053634267597940176, + "compression/movement_sparsity/importance_threshold": -0.15481807270185954, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19667679071426392, + "epoch": 6.09, + "learning_rate": 1.9990364923945086e-06, + "loss": 0.1792, + "step": 16840, + "task_loss": 0.5587581992149353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05366571965756378, + "compression/movement_sparsity/importance_threshold": -0.15438964916687037, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16758200526237488, + "epoch": 6.09, + "learning_rate": 1.9989516561147736e-06, + "loss": 0.1634, + "step": 16850, + "task_loss": 0.39092981815338135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053697113639354, + "compression/movement_sparsity/importance_threshold": -0.15396201673780519, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16262221336364746, + "epoch": 6.09, + "learning_rate": 1.9988632427993927e-06, + "loss": 0.1754, + "step": 16860, + "task_loss": 0.632863461971283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05372844959698221, + "compression/movement_sparsity/importance_threshold": -0.15353517468358013, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13204121589660645, + "epoch": 6.1, + "learning_rate": 1.9987712527649556e-06, + "loss": 0.1688, + "step": 16870, + "task_loss": 0.37428024411201477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05375972758411986, + "compression/movement_sparsity/importance_threshold": -0.15310912227311113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16641512513160706, + "epoch": 6.1, + "learning_rate": 1.9986756863408597e-06, + "loss": 0.1674, + "step": 16880, + "task_loss": 0.3117241859436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05379094765443833, + "compression/movement_sparsity/importance_threshold": -0.15268385877531432, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15495559573173523, + "epoch": 6.1, + "learning_rate": 1.9985765438693077e-06, + "loss": 0.1739, + "step": 16890, + "task_loss": 0.7027961015701294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053822109861609034, + "compression/movement_sparsity/importance_threshold": -0.15225938345910583, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15895962715148926, + "epoch": 6.11, + "learning_rate": 1.998473825705308e-06, + "loss": 0.1733, + "step": 16900, + "task_loss": 0.5291743874549866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0538532142593034, + "compression/movement_sparsity/importance_threshold": -0.1518356955934017, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15101155638694763, + "epoch": 6.11, + "learning_rate": 1.9983675322166733e-06, + "loss": 0.1672, + "step": 16910, + "task_loss": 0.5095721483230591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053884260901192825, + "compression/movement_sparsity/importance_threshold": -0.15141279444711797, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12188731133937836, + "epoch": 6.11, + "learning_rate": 1.9982576637840178e-06, + "loss": 0.1629, + "step": 16920, + "task_loss": 0.5534610748291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053915249840948726, + "compression/movement_sparsity/importance_threshold": -0.15099067928917054, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15203392505645752, + "epoch": 6.12, + "learning_rate": 1.9981442208007564e-06, + "loss": 0.1687, + "step": 16930, + "task_loss": 0.4714369475841522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0539461811322425, + "compression/movement_sparsity/importance_threshold": -0.15056934938847588, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1437663435935974, + "epoch": 6.12, + "learning_rate": 1.9980272036731065e-06, + "loss": 0.1692, + "step": 16940, + "task_loss": 0.2952896058559418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053977054828745576, + "compression/movement_sparsity/importance_threshold": -0.1501488040139497, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14852911233901978, + "epoch": 6.13, + "learning_rate": 1.9979066128200797e-06, + "loss": 0.1635, + "step": 16950, + "task_loss": 0.7087572813034058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05400787098412936, + "compression/movement_sparsity/importance_threshold": -0.14972904243450824, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15441861748695374, + "epoch": 6.13, + "learning_rate": 1.997782448673488e-06, + "loss": 0.1647, + "step": 16960, + "task_loss": 0.46169987320899963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05403862965206526, + "compression/movement_sparsity/importance_threshold": -0.14931006391906743, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13028943538665771, + "epoch": 6.13, + "learning_rate": 1.9976547116779365e-06, + "loss": 0.1576, + "step": 16970, + "task_loss": 0.4005166292190552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054069330886224676, + "compression/movement_sparsity/importance_threshold": -0.1488918677365435, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1454225480556488, + "epoch": 6.14, + "learning_rate": 1.9975234022908244e-06, + "loss": 0.1506, + "step": 16980, + "task_loss": 0.6841634511947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054099974740279035, + "compression/movement_sparsity/importance_threshold": -0.1484744531558524, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15946821868419647, + "epoch": 6.14, + "learning_rate": 1.997388520982343e-06, + "loss": 0.1786, + "step": 16990, + "task_loss": 0.5624659061431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054130561267899745, + "compression/movement_sparsity/importance_threshold": -0.14805781944591023, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13070663809776306, + "epoch": 6.14, + "learning_rate": 1.997250068235474e-06, + "loss": 0.1544, + "step": 17000, + "task_loss": 0.46055838465690613 + }, + { + "epoch": 6.14, + "eval_exact_match": 83.52885525070955, + "eval_f1": 89.92910145189475, + "step": 17000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05416109052275821, + "compression/movement_sparsity/importance_threshold": -0.14764196587563316, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14563003182411194, + "epoch": 6.15, + "learning_rate": 1.9971080445459876e-06, + "loss": 0.1777, + "step": 17010, + "task_loss": 0.31575658917427063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054191562558525835, + "compression/movement_sparsity/importance_threshold": -0.1472268917139371, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12489091604948044, + "epoch": 6.15, + "learning_rate": 1.9969624504224404e-06, + "loss": 0.1681, + "step": 17020, + "task_loss": 0.3158435821533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05422197742887405, + "compression/movement_sparsity/importance_threshold": -0.14681259622973808, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1576835811138153, + "epoch": 6.15, + "learning_rate": 1.996813286386175e-06, + "loss": 0.1627, + "step": 17030, + "task_loss": 0.5685656070709229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05425233518747425, + "compression/movement_sparsity/importance_threshold": -0.14639907869195234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15400083363056183, + "epoch": 6.16, + "learning_rate": 1.9966605529713155e-06, + "loss": 0.1656, + "step": 17040, + "task_loss": 0.24658185243606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05428263588799785, + "compression/movement_sparsity/importance_threshold": -0.14598633836949582, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13884764909744263, + "epoch": 6.16, + "learning_rate": 1.996504250724769e-06, + "loss": 0.1629, + "step": 17050, + "task_loss": 0.5981327295303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054312879584116264, + "compression/movement_sparsity/importance_threshold": -0.14557437453128474, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1670169234275818, + "epoch": 6.17, + "learning_rate": 1.9963443802062207e-06, + "loss": 0.1626, + "step": 17060, + "task_loss": 0.34697288274765015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054343066329500894, + "compression/movement_sparsity/importance_threshold": -0.14516318644623505, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16179323196411133, + "epoch": 6.17, + "learning_rate": 1.996180941988133e-06, + "loss": 0.1609, + "step": 17070, + "task_loss": 0.5075516700744629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05437319617782316, + "compression/movement_sparsity/importance_threshold": -0.14475277338326276, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1471652388572693, + "epoch": 6.17, + "learning_rate": 1.9960139366557434e-06, + "loss": 0.1641, + "step": 17080, + "task_loss": 0.41368719935417175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05440326918275447, + "compression/movement_sparsity/importance_threshold": -0.144343134611284, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1576196253299713, + "epoch": 6.18, + "learning_rate": 1.995843364807064e-06, + "loss": 0.179, + "step": 17090, + "task_loss": 0.5158824324607849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05443328539796623, + "compression/movement_sparsity/importance_threshold": -0.14393426939921494, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1396743804216385, + "epoch": 6.18, + "learning_rate": 1.9956692270528757e-06, + "loss": 0.1645, + "step": 17100, + "task_loss": 0.4924278259277344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054463244877129856, + "compression/movement_sparsity/importance_threshold": -0.14352617701597148, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18293413519859314, + "epoch": 6.18, + "learning_rate": 1.9954915240167297e-06, + "loss": 0.1677, + "step": 17110, + "task_loss": 0.60299152135849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05449314767391675, + "compression/movement_sparsity/importance_threshold": -0.14311885673046976, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.167672261595726, + "epoch": 6.19, + "learning_rate": 1.995310256334943e-06, + "loss": 0.1653, + "step": 17120, + "task_loss": 0.39202722907066345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05452299384199834, + "compression/movement_sparsity/importance_threshold": -0.1427123078116258, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18624365329742432, + "epoch": 6.19, + "learning_rate": 1.995125424656597e-06, + "loss": 0.1685, + "step": 17130, + "task_loss": 0.6348949074745178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054552783435046014, + "compression/movement_sparsity/importance_threshold": -0.14230652952835587, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14822807908058167, + "epoch": 6.19, + "learning_rate": 1.9949370296435347e-06, + "loss": 0.1655, + "step": 17140, + "task_loss": 0.44887471199035645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0545825165067312, + "compression/movement_sparsity/importance_threshold": -0.14190152114957577, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1382816731929779, + "epoch": 6.2, + "learning_rate": 1.99474507197036e-06, + "loss": 0.1745, + "step": 17150, + "task_loss": 0.2659206986427307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05461219311072531, + "compression/movement_sparsity/importance_threshold": -0.14149728194420164, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1857585608959198, + "epoch": 6.2, + "learning_rate": 1.9945495523244317e-06, + "loss": 0.1702, + "step": 17160, + "task_loss": 0.5841749906539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05464181330069973, + "compression/movement_sparsity/importance_threshold": -0.1410938111811496, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14488109946250916, + "epoch": 6.21, + "learning_rate": 1.994350471405865e-06, + "loss": 0.1559, + "step": 17170, + "task_loss": 0.7074770927429199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054671377130325896, + "compression/movement_sparsity/importance_threshold": -0.14069110812933583, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13776680827140808, + "epoch": 6.21, + "learning_rate": 1.994147829927527e-06, + "loss": 0.1737, + "step": 17180, + "task_loss": 0.48739925026893616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05470088465327521, + "compression/movement_sparsity/importance_threshold": -0.1402891720576761, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20274044573307037, + "epoch": 6.21, + "learning_rate": 1.9939416286150343e-06, + "loss": 0.1674, + "step": 17190, + "task_loss": 0.47881031036376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05473033592321909, + "compression/movement_sparsity/importance_threshold": -0.13988800223508668, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1323615312576294, + "epoch": 6.22, + "learning_rate": 1.9937318682067498e-06, + "loss": 0.1702, + "step": 17200, + "task_loss": 0.3447995185852051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05475973099382892, + "compression/movement_sparsity/importance_threshold": -0.13948759793048382, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15247386693954468, + "epoch": 6.22, + "learning_rate": 1.9935185494537817e-06, + "loss": 0.1743, + "step": 17210, + "task_loss": 0.5565829277038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054789069918776155, + "compression/movement_sparsity/importance_threshold": -0.1390879584127831, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16268417239189148, + "epoch": 6.22, + "learning_rate": 1.9933016731199798e-06, + "loss": 0.1532, + "step": 17220, + "task_loss": 0.5147459506988525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05481835275173216, + "compression/movement_sparsity/importance_threshold": -0.138689082950901, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13939876854419708, + "epoch": 6.23, + "learning_rate": 1.993081239981932e-06, + "loss": 0.1782, + "step": 17230, + "task_loss": 0.19769251346588135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05484757954636838, + "compression/movement_sparsity/importance_threshold": -0.1382909708137533, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1804233193397522, + "epoch": 6.23, + "learning_rate": 1.9928572508289638e-06, + "loss": 0.1763, + "step": 17240, + "task_loss": 0.3552435040473938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0548767503563562, + "compression/movement_sparsity/importance_threshold": -0.1378936212702564, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12657399475574493, + "epoch": 6.23, + "learning_rate": 1.9926297064631324e-06, + "loss": 0.1487, + "step": 17250, + "task_loss": 0.3169633448123932 + }, + { + "epoch": 6.23, + "eval_exact_match": 83.73699148533585, + "eval_f1": 90.10480575796605, + "step": 17250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05490586523536706, + "compression/movement_sparsity/importance_threshold": -0.13749703358932597, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15277908742427826, + "epoch": 6.24, + "learning_rate": 1.9923986076992264e-06, + "loss": 0.171, + "step": 17260, + "task_loss": 0.472973108291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05493492423707234, + "compression/movement_sparsity/importance_threshold": -0.13710120703987838, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15695881843566895, + "epoch": 6.24, + "learning_rate": 1.9921639553647624e-06, + "loss": 0.1665, + "step": 17270, + "task_loss": 0.6075320839881897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05496392741514347, + "compression/movement_sparsity/importance_threshold": -0.13670614089082955, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18700659275054932, + "epoch": 6.25, + "learning_rate": 1.991925750299981e-06, + "loss": 0.1821, + "step": 17280, + "task_loss": 0.6988071799278259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054992874823251846, + "compression/movement_sparsity/importance_threshold": -0.13631183441109584, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15885579586029053, + "epoch": 6.25, + "learning_rate": 1.9916839933578437e-06, + "loss": 0.162, + "step": 17290, + "task_loss": 0.19446536898612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05502176651506889, + "compression/movement_sparsity/importance_threshold": -0.13591828686959284, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19656088948249817, + "epoch": 6.25, + "learning_rate": 1.991438685404032e-06, + "loss": 0.1763, + "step": 17300, + "task_loss": 0.423044353723526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05505060254426602, + "compression/movement_sparsity/importance_threshold": -0.1355254975352368, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16628031432628632, + "epoch": 6.26, + "learning_rate": 1.9911898273169412e-06, + "loss": 0.1661, + "step": 17310, + "task_loss": 0.35148802399635315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05507938296451462, + "compression/movement_sparsity/importance_threshold": -0.13513346567694406, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16762763261795044, + "epoch": 6.26, + "learning_rate": 1.990937419987681e-06, + "loss": 0.1662, + "step": 17320, + "task_loss": 0.6487798690795898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05510810782948613, + "compression/movement_sparsity/importance_threshold": -0.13474219056363024, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1576879620552063, + "epoch": 6.26, + "learning_rate": 1.9906814643200674e-06, + "loss": 0.1753, + "step": 17330, + "task_loss": 0.31458860635757446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.055136777192851934, + "compression/movement_sparsity/importance_threshold": -0.13435167146421179, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12366840243339539, + "epoch": 6.27, + "learning_rate": 1.9904219612306246e-06, + "loss": 0.1771, + "step": 17340, + "task_loss": 0.3803785443305969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05516539110828347, + "compression/movement_sparsity/importance_threshold": -0.13396190764760452, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16160424053668976, + "epoch": 6.27, + "learning_rate": 1.9901589116485788e-06, + "loss": 0.1682, + "step": 17350, + "task_loss": 0.5825801491737366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05519394962945213, + "compression/movement_sparsity/importance_threshold": -0.13357289838272457, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.225599467754364, + "epoch": 6.27, + "learning_rate": 1.9898923165158548e-06, + "loss": 0.1655, + "step": 17360, + "task_loss": 0.45244693756103516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05522245281002932, + "compression/movement_sparsity/importance_threshold": -0.1331846429384882, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16648636758327484, + "epoch": 6.28, + "learning_rate": 1.989622176787074e-06, + "loss": 0.164, + "step": 17370, + "task_loss": 0.3290000557899475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05525090070368647, + "compression/movement_sparsity/importance_threshold": -0.1327971405838111, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18633422255516052, + "epoch": 6.28, + "learning_rate": 1.9893484934295492e-06, + "loss": 0.1719, + "step": 17380, + "task_loss": 0.5563036799430847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05527929336409498, + "compression/movement_sparsity/importance_threshold": -0.13241039058760962, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1366187334060669, + "epoch": 6.28, + "learning_rate": 1.9890712674232838e-06, + "loss": 0.1834, + "step": 17390, + "task_loss": 0.3646396994590759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05530763084492626, + "compression/movement_sparsity/importance_threshold": -0.1320243922187998, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16789717972278595, + "epoch": 6.29, + "learning_rate": 1.9887904997609654e-06, + "loss": 0.1746, + "step": 17400, + "task_loss": 0.47817733883857727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05533591319985172, + "compression/movement_sparsity/importance_threshold": -0.13163914474629757, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21855862438678741, + "epoch": 6.29, + "learning_rate": 1.9885061914479633e-06, + "loss": 0.1875, + "step": 17410, + "task_loss": 0.40912604331970215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05536414048254276, + "compression/movement_sparsity/importance_threshold": -0.13125464743901927, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1337123066186905, + "epoch": 6.3, + "learning_rate": 1.9882183435023266e-06, + "loss": 0.166, + "step": 17420, + "task_loss": 0.3562582731246948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05539231274667082, + "compression/movement_sparsity/importance_threshold": -0.1308708995658806, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16915687918663025, + "epoch": 6.3, + "learning_rate": 1.987926956954778e-06, + "loss": 0.1834, + "step": 17430, + "task_loss": 0.5122113227844238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.055420430045907285, + "compression/movement_sparsity/importance_threshold": -0.13048790039579783, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14700746536254883, + "epoch": 6.3, + "learning_rate": 1.9876320328487113e-06, + "loss": 0.1639, + "step": 17440, + "task_loss": 0.47502970695495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05544849243392357, + "compression/movement_sparsity/importance_threshold": -0.13010564919768708, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1848972737789154, + "epoch": 6.31, + "learning_rate": 1.9873335722401875e-06, + "loss": 0.1724, + "step": 17450, + "task_loss": 0.7271729707717896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0554764999643911, + "compression/movement_sparsity/importance_threshold": -0.12972414524046427, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11952685564756393, + "epoch": 6.31, + "learning_rate": 1.9870315761979317e-06, + "loss": 0.1617, + "step": 17460, + "task_loss": 0.1481960266828537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05550445269098127, + "compression/movement_sparsity/importance_threshold": -0.12934338779304566, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14205265045166016, + "epoch": 6.31, + "learning_rate": 1.9867260458033276e-06, + "loss": 0.1649, + "step": 17470, + "task_loss": 0.2205222249031067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0555323506673655, + "compression/movement_sparsity/importance_threshold": -0.12896337612434705, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14286062121391296, + "epoch": 6.32, + "learning_rate": 1.986416982150416e-06, + "loss": 0.1646, + "step": 17480, + "task_loss": 0.28470635414123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05556019394721519, + "compression/movement_sparsity/importance_threshold": -0.1285841095032848, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1712353527545929, + "epoch": 6.32, + "learning_rate": 1.9861043863458876e-06, + "loss": 0.1574, + "step": 17490, + "task_loss": 0.5386769771575928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05558798258420176, + "compression/movement_sparsity/importance_threshold": -0.12820558719877473, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13892130553722382, + "epoch": 6.32, + "learning_rate": 1.9857882595090833e-06, + "loss": 0.1648, + "step": 17500, + "task_loss": 0.49137255549430847 + }, + { + "epoch": 6.32, + "eval_exact_match": 83.58561967833491, + "eval_f1": 90.0004746665234, + "step": 17500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05561571663199662, + "compression/movement_sparsity/importance_threshold": -0.12782780847973296, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16053462028503418, + "epoch": 6.33, + "learning_rate": 1.985468602771986e-06, + "loss": 0.1744, + "step": 17510, + "task_loss": 0.41019684076309204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05564339614427117, + "compression/movement_sparsity/importance_threshold": -0.12745077261507576, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1520998179912567, + "epoch": 6.33, + "learning_rate": 1.98514541727922e-06, + "loss": 0.1696, + "step": 17520, + "task_loss": 0.5022592544555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05567102117469682, + "compression/movement_sparsity/importance_threshold": -0.12707447887371903, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12280330061912537, + "epoch": 6.34, + "learning_rate": 1.984818704188044e-06, + "loss": 0.1538, + "step": 17530, + "task_loss": 0.34392112493515015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05569859177694501, + "compression/movement_sparsity/importance_threshold": -0.12669892652457881, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2284255176782608, + "epoch": 6.34, + "learning_rate": 1.9844884646683487e-06, + "loss": 0.1699, + "step": 17540, + "task_loss": 0.39971989393234253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05572610800468712, + "compression/movement_sparsity/importance_threshold": -0.12632411483657113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15068349242210388, + "epoch": 6.34, + "learning_rate": 1.984154699902653e-06, + "loss": 0.1684, + "step": 17550, + "task_loss": 0.36631715297698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05575356991159457, + "compression/movement_sparsity/importance_threshold": -0.12595004307861224, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12671113014221191, + "epoch": 6.35, + "learning_rate": 1.983817411086097e-06, + "loss": 0.1697, + "step": 17560, + "task_loss": 0.49995699524879456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.055780977551338776, + "compression/movement_sparsity/importance_threshold": -0.12557671051961794, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1240062266588211, + "epoch": 6.35, + "learning_rate": 1.9834765994264426e-06, + "loss": 0.1615, + "step": 17570, + "task_loss": 0.3404150605201721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.055808330977591136, + "compression/movement_sparsity/importance_threshold": -0.1252041164285046, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15258857607841492, + "epoch": 6.35, + "learning_rate": 1.983132266144064e-06, + "loss": 0.1698, + "step": 17580, + "task_loss": 0.7967875599861145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05583563024402308, + "compression/movement_sparsity/importance_threshold": -0.12483226007418802, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14428968727588654, + "epoch": 6.36, + "learning_rate": 1.9827844124719453e-06, + "loss": 0.1595, + "step": 17590, + "task_loss": 0.44335755705833435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05586287540430599, + "compression/movement_sparsity/importance_threshold": -0.12446114072558456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19588825106620789, + "epoch": 6.36, + "learning_rate": 1.9824330396556784e-06, + "loss": 0.1669, + "step": 17600, + "task_loss": 0.3872692286968231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0558900665121113, + "compression/movement_sparsity/importance_threshold": -0.12409075765161004, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15780013799667358, + "epoch": 6.36, + "learning_rate": 1.982078148953455e-06, + "loss": 0.1693, + "step": 17610, + "task_loss": 0.36154231429100037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05591720362111042, + "compression/movement_sparsity/importance_threshold": -0.12372111012118059, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18099361658096313, + "epoch": 6.37, + "learning_rate": 1.981719741636064e-06, + "loss": 0.1772, + "step": 17620, + "task_loss": 0.3962492346763611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.055944286784974755, + "compression/movement_sparsity/importance_threshold": -0.12335219740321224, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1900426745414734, + "epoch": 6.37, + "learning_rate": 1.981357818986887e-06, + "loss": 0.1792, + "step": 17630, + "task_loss": 0.5194408893585205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0559713160573757, + "compression/movement_sparsity/importance_threshold": -0.12298401876662124, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15779553353786469, + "epoch": 6.38, + "learning_rate": 1.9809923823018917e-06, + "loss": 0.1569, + "step": 17640, + "task_loss": 0.47046637535095215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05599829149198469, + "compression/movement_sparsity/importance_threshold": -0.1226165734803234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.134331613779068, + "epoch": 6.38, + "learning_rate": 1.980623432889631e-06, + "loss": 0.1596, + "step": 17650, + "task_loss": 0.3972647786140442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05602521314247313, + "compression/movement_sparsity/importance_threshold": -0.12224986081323497, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18308016657829285, + "epoch": 6.38, + "learning_rate": 1.9802509720712354e-06, + "loss": 0.1725, + "step": 17660, + "task_loss": 0.6477333307266235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05605208106251243, + "compression/movement_sparsity/importance_threshold": -0.12188388003427186, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13036686182022095, + "epoch": 6.39, + "learning_rate": 1.9798750011804076e-06, + "loss": 0.1592, + "step": 17670, + "task_loss": 0.31419873237609863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056078895305773986, + "compression/movement_sparsity/importance_threshold": -0.12151863041235034, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1570032238960266, + "epoch": 6.39, + "learning_rate": 1.979495521563421e-06, + "loss": 0.1522, + "step": 17680, + "task_loss": 0.37213003635406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056105655925929226, + "compression/movement_sparsity/importance_threshold": -0.12115411121638642, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15974989533424377, + "epoch": 6.39, + "learning_rate": 1.9791125345791115e-06, + "loss": 0.165, + "step": 17690, + "task_loss": 0.33860349655151367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05613236297664955, + "compression/movement_sparsity/importance_threshold": -0.12079032171529602, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13290277123451233, + "epoch": 6.4, + "learning_rate": 1.9787260415988757e-06, + "loss": 0.1589, + "step": 17700, + "task_loss": 0.2330818772315979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05615901651160638, + "compression/movement_sparsity/importance_threshold": -0.12042726117799529, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15578773617744446, + "epoch": 6.4, + "learning_rate": 1.9783360440066637e-06, + "loss": 0.1779, + "step": 17710, + "task_loss": 0.7083296775817871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05618561658447112, + "compression/movement_sparsity/importance_threshold": -0.12006492887340048, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15509989857673645, + "epoch": 6.4, + "learning_rate": 1.977942543198974e-06, + "loss": 0.1666, + "step": 17720, + "task_loss": 0.4361785352230072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056212163248915176, + "compression/movement_sparsity/importance_threshold": -0.11970332407042727, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17246806621551514, + "epoch": 6.41, + "learning_rate": 1.9775455405848506e-06, + "loss": 0.171, + "step": 17730, + "task_loss": 0.756430983543396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05623865655860996, + "compression/movement_sparsity/importance_threshold": -0.11934244603799216, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14905114471912384, + "epoch": 6.41, + "learning_rate": 1.977145037585877e-06, + "loss": 0.171, + "step": 17740, + "task_loss": 0.33792349696159363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05626509656722689, + "compression/movement_sparsity/importance_threshold": -0.11898229404501082, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13120922446250916, + "epoch": 6.41, + "learning_rate": 1.9767410356361683e-06, + "loss": 0.1557, + "step": 17750, + "task_loss": 0.32760465145111084 + }, + { + "epoch": 6.41, + "eval_exact_match": 83.59508041627247, + "eval_f1": 89.9605980268485, + "step": 17750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05629148332843737, + "compression/movement_sparsity/importance_threshold": -0.11862286736039962, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17200222611427307, + "epoch": 6.42, + "learning_rate": 1.9763335361823723e-06, + "loss": 0.1676, + "step": 17760, + "task_loss": 0.4891778826713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05631781689591282, + "compression/movement_sparsity/importance_threshold": -0.11826416525307437, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14608396589756012, + "epoch": 6.42, + "learning_rate": 1.975922540683658e-06, + "loss": 0.1627, + "step": 17770, + "task_loss": 0.21372440457344055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056344097323324634, + "compression/movement_sparsity/importance_threshold": -0.11790618699195143, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15041396021842957, + "epoch": 6.43, + "learning_rate": 1.975508050611714e-06, + "loss": 0.1692, + "step": 17780, + "task_loss": 0.372314989566803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05637032466434423, + "compression/movement_sparsity/importance_threshold": -0.11754893184594672, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17189109325408936, + "epoch": 6.43, + "learning_rate": 1.975090067450742e-06, + "loss": 0.186, + "step": 17790, + "task_loss": 0.3796417713165283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05639649897264304, + "compression/movement_sparsity/importance_threshold": -0.11719239908397616, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14438897371292114, + "epoch": 6.43, + "learning_rate": 1.9746685926974515e-06, + "loss": 0.1726, + "step": 17800, + "task_loss": 0.4204254746437073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05642262030189243, + "compression/movement_sparsity/importance_threshold": -0.11683658797495611, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15841056406497955, + "epoch": 6.44, + "learning_rate": 1.9742436278610548e-06, + "loss": 0.1613, + "step": 17810, + "task_loss": 0.23542027175426483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05644868870576385, + "compression/movement_sparsity/importance_threshold": -0.11648149778780237, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17361599206924438, + "epoch": 6.44, + "learning_rate": 1.9738151744632616e-06, + "loss": 0.195, + "step": 17820, + "task_loss": 0.2202863097190857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05647470423792869, + "compression/movement_sparsity/importance_threshold": -0.11612712779143108, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14202536642551422, + "epoch": 6.44, + "learning_rate": 1.973383234038274e-06, + "loss": 0.1661, + "step": 17830, + "task_loss": 0.30595502257347107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05650066695205837, + "compression/movement_sparsity/importance_threshold": -0.1157734772547585, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1708425134420395, + "epoch": 6.45, + "learning_rate": 1.972947808132779e-06, + "loss": 0.1743, + "step": 17840, + "task_loss": 0.849858820438385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0565265769018243, + "compression/movement_sparsity/importance_threshold": -0.11542054544670044, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15541870892047882, + "epoch": 6.45, + "learning_rate": 1.972508898305946e-06, + "loss": 0.1744, + "step": 17850, + "task_loss": 0.4322483241558075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05655243414089788, + "compression/movement_sparsity/importance_threshold": -0.11506833163617303, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15814724564552307, + "epoch": 6.45, + "learning_rate": 1.9720665061294173e-06, + "loss": 0.171, + "step": 17860, + "task_loss": 0.2430177628993988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05657823872295053, + "compression/movement_sparsity/importance_threshold": -0.11471683509209252, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14094632863998413, + "epoch": 6.46, + "learning_rate": 1.9716206331873075e-06, + "loss": 0.175, + "step": 17870, + "task_loss": 0.36099082231521606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05660399070165367, + "compression/movement_sparsity/importance_threshold": -0.11436605508337472, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15439045429229736, + "epoch": 6.46, + "learning_rate": 1.971171281076193e-06, + "loss": 0.1787, + "step": 17880, + "task_loss": 0.5687848329544067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056629690130678687, + "compression/movement_sparsity/importance_threshold": -0.11401599087893588, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16982746124267578, + "epoch": 6.47, + "learning_rate": 1.9707184514051093e-06, + "loss": 0.1699, + "step": 17890, + "task_loss": 0.5424094200134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05665533706369701, + "compression/movement_sparsity/importance_threshold": -0.11366664174769192, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1268695890903473, + "epoch": 6.47, + "learning_rate": 1.970262145795545e-06, + "loss": 0.1744, + "step": 17900, + "task_loss": 0.22294151782989502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056680931554380046, + "compression/movement_sparsity/importance_threshold": -0.11331800695855909, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15782135725021362, + "epoch": 6.47, + "learning_rate": 1.969802365881433e-06, + "loss": 0.1716, + "step": 17910, + "task_loss": 0.6972410678863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0567064736563992, + "compression/movement_sparsity/importance_threshold": -0.11297008578045331, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1635473221540451, + "epoch": 6.48, + "learning_rate": 1.969339113309149e-06, + "loss": 0.1581, + "step": 17920, + "task_loss": 0.24740125238895416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05673196342342589, + "compression/movement_sparsity/importance_threshold": -0.1126228774822906, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16152726113796234, + "epoch": 6.48, + "learning_rate": 1.9688723897375036e-06, + "loss": 0.1625, + "step": 17930, + "task_loss": 0.4657655954360962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05675740090913152, + "compression/movement_sparsity/importance_threshold": -0.11227638133298723, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.24781015515327454, + "epoch": 6.48, + "learning_rate": 1.968402196837735e-06, + "loss": 0.1717, + "step": 17940, + "task_loss": 0.33787935972213745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05678278616718751, + "compression/movement_sparsity/importance_threshold": -0.11193059660145899, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15699215233325958, + "epoch": 6.49, + "learning_rate": 1.9679285362935054e-06, + "loss": 0.1605, + "step": 17950, + "task_loss": 0.3983635902404785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056808119251265256, + "compression/movement_sparsity/importance_threshold": -0.11158552255662224, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14720289409160614, + "epoch": 6.49, + "learning_rate": 1.967451409800893e-06, + "loss": 0.1625, + "step": 17960, + "task_loss": 0.5441750288009644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056833400215036176, + "compression/movement_sparsity/importance_threshold": -0.11124115846739302, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10097290575504303, + "epoch": 6.49, + "learning_rate": 1.966970819068388e-06, + "loss": 0.168, + "step": 17970, + "task_loss": 0.5651923418045044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05685862911217168, + "compression/movement_sparsity/importance_threshold": -0.11089750360268713, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12678371369838715, + "epoch": 6.5, + "learning_rate": 1.966486765816884e-06, + "loss": 0.1623, + "step": 17980, + "task_loss": 0.2238824963569641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05688380599634319, + "compression/movement_sparsity/importance_threshold": -0.11055455723142082, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1631462424993515, + "epoch": 6.5, + "learning_rate": 1.9659992517796746e-06, + "loss": 0.1727, + "step": 17990, + "task_loss": 0.4262044131755829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056908930921222095, + "compression/movement_sparsity/importance_threshold": -0.11021231862251013, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15143990516662598, + "epoch": 6.51, + "learning_rate": 1.965508278702444e-06, + "loss": 0.1656, + "step": 18000, + "task_loss": 0.20069824159145355 + }, + { + "epoch": 6.51, + "eval_exact_match": 83.56669820245979, + "eval_f1": 89.97247850295564, + "step": 18000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05693400394047982, + "compression/movement_sparsity/importance_threshold": -0.10987078704487119, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16191574931144714, + "epoch": 6.51, + "learning_rate": 1.9650138483432644e-06, + "loss": 0.1679, + "step": 18010, + "task_loss": 0.2866957187652588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05695902510778778, + "compression/movement_sparsity/importance_threshold": -0.10952996176741991, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17104315757751465, + "epoch": 6.51, + "learning_rate": 1.964515962472586e-06, + "loss": 0.1615, + "step": 18020, + "task_loss": 0.278532475233078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05698399447681737, + "compression/movement_sparsity/importance_threshold": -0.10918984205907256, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17607031762599945, + "epoch": 6.52, + "learning_rate": 1.9640146228732343e-06, + "loss": 0.1609, + "step": 18030, + "task_loss": 0.624789834022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05700891210124001, + "compression/movement_sparsity/importance_threshold": -0.10885042718874505, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18854406476020813, + "epoch": 6.52, + "learning_rate": 1.9635098313404e-06, + "loss": 0.1678, + "step": 18040, + "task_loss": 0.5639091730117798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05703377803472711, + "compression/movement_sparsity/importance_threshold": -0.10851171642535351, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18219125270843506, + "epoch": 6.52, + "learning_rate": 1.963001589681636e-06, + "loss": 0.1618, + "step": 18050, + "task_loss": 0.41501083970069885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057058592330950085, + "compression/movement_sparsity/importance_threshold": -0.10817370903781398, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1165059357881546, + "epoch": 6.53, + "learning_rate": 1.962541223834976e-06, + "loss": 0.1657, + "step": 18060, + "task_loss": 0.17311452329158783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05708335504358033, + "compression/movement_sparsity/importance_threshold": -0.10783640429504271, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16131412982940674, + "epoch": 6.53, + "learning_rate": 1.9620264319609926e-06, + "loss": 0.1773, + "step": 18070, + "task_loss": 0.451940655708313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05710806622628928, + "compression/movement_sparsity/importance_threshold": -0.10749980146595539, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15116122364997864, + "epoch": 6.53, + "learning_rate": 1.96150819527282e-06, + "loss": 0.1819, + "step": 18080, + "task_loss": 0.49812251329421997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05713272593274832, + "compression/movement_sparsity/importance_threshold": -0.10716389981946839, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1451411247253418, + "epoch": 6.54, + "learning_rate": 1.960986515626155e-06, + "loss": 0.1655, + "step": 18090, + "task_loss": 0.2867373824119568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05715733421662887, + "compression/movement_sparsity/importance_threshold": -0.10682869862449773, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14213800430297852, + "epoch": 6.54, + "learning_rate": 1.960461394889023e-06, + "loss": 0.1562, + "step": 18100, + "task_loss": 0.3731827139854431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05718189113160235, + "compression/movement_sparsity/importance_threshold": -0.10649419714995934, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1480969786643982, + "epoch": 6.54, + "learning_rate": 1.95993283494177e-06, + "loss": 0.1588, + "step": 18110, + "task_loss": 0.45702794194221497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05720639673134016, + "compression/movement_sparsity/importance_threshold": -0.10616039466476945, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16650226712226868, + "epoch": 6.55, + "learning_rate": 1.9594008376770582e-06, + "loss": 0.1623, + "step": 18120, + "task_loss": 0.3911457359790802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05723085106951372, + "compression/movement_sparsity/importance_threshold": -0.105827290437844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1613444685935974, + "epoch": 6.55, + "learning_rate": 1.9588654049998583e-06, + "loss": 0.1854, + "step": 18130, + "task_loss": 0.46520888805389404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057255254199794436, + "compression/movement_sparsity/importance_threshold": -0.10549488373809901, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15832461416721344, + "epoch": 6.56, + "learning_rate": 1.958326538827442e-06, + "loss": 0.1664, + "step": 18140, + "task_loss": 0.6141856908798218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0572796061758537, + "compression/movement_sparsity/importance_threshold": -0.10516317383445084, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2309219241142273, + "epoch": 6.56, + "learning_rate": 1.9577842410893747e-06, + "loss": 0.1814, + "step": 18150, + "task_loss": 0.7296000719070435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05730390705136296, + "compression/movement_sparsity/importance_threshold": -0.10483215999581541, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1386280059814453, + "epoch": 6.56, + "learning_rate": 1.9572385137275114e-06, + "loss": 0.1704, + "step": 18160, + "task_loss": 0.33172696828842163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05732815687999359, + "compression/movement_sparsity/importance_threshold": -0.10450184149110864, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16742317378520966, + "epoch": 6.57, + "learning_rate": 1.9566893586959866e-06, + "loss": 0.1627, + "step": 18170, + "task_loss": 0.7185513973236084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05735235571541704, + "compression/movement_sparsity/importance_threshold": -0.10417221758924655, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14879506826400757, + "epoch": 6.57, + "learning_rate": 1.956136777961209e-06, + "loss": 0.1696, + "step": 18180, + "task_loss": 0.5774948596954346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057376503611304676, + "compression/movement_sparsity/importance_threshold": -0.10384328755914563, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1316477358341217, + "epoch": 6.57, + "learning_rate": 1.955580773501854e-06, + "loss": 0.1585, + "step": 18190, + "task_loss": 0.15516257286071777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05740060062132793, + "compression/movement_sparsity/importance_threshold": -0.10351505066972155, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13433092832565308, + "epoch": 6.58, + "learning_rate": 1.955021347308856e-06, + "loss": 0.1766, + "step": 18200, + "task_loss": 0.3908587694168091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05742464679915822, + "compression/movement_sparsity/importance_threshold": -0.10318750618989048, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12410083413124084, + "epoch": 6.58, + "learning_rate": 1.954458501385403e-06, + "loss": 0.1743, + "step": 18210, + "task_loss": 0.5834453105926514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05744864219846695, + "compression/movement_sparsity/importance_threshold": -0.10286065338856853, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14193576574325562, + "epoch": 6.58, + "learning_rate": 1.953892237746928e-06, + "loss": 0.1661, + "step": 18220, + "task_loss": 0.4615253806114197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05747258687292553, + "compression/movement_sparsity/importance_threshold": -0.10253449153467176, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17011338472366333, + "epoch": 6.59, + "learning_rate": 1.9533225584211015e-06, + "loss": 0.1622, + "step": 18230, + "task_loss": 0.2667291462421417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05749648087620537, + "compression/movement_sparsity/importance_threshold": -0.10220901989711617, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.156767338514328, + "epoch": 6.59, + "learning_rate": 1.952749465447825e-06, + "loss": 0.1639, + "step": 18240, + "task_loss": 0.37695831060409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057520324261977875, + "compression/movement_sparsity/importance_threshold": -0.10188423774481792, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13007321953773499, + "epoch": 6.6, + "learning_rate": 1.9521729608792247e-06, + "loss": 0.1629, + "step": 18250, + "task_loss": 0.50368332862854 + }, + { + "epoch": 6.6, + "eval_exact_match": 83.60454115421003, + "eval_f1": 89.95005100815796, + "step": 18250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05754411708391447, + "compression/movement_sparsity/importance_threshold": -0.10156014434669303, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1553541123867035, + "epoch": 6.6, + "learning_rate": 1.9515930467796414e-06, + "loss": 0.1703, + "step": 18260, + "task_loss": 0.3957485556602478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05756785939568656, + "compression/movement_sparsity/importance_threshold": -0.10123673897165752, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18168096244335175, + "epoch": 6.6, + "learning_rate": 1.9510097252256255e-06, + "loss": 0.1604, + "step": 18270, + "task_loss": 0.38172927498817444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057591551250965545, + "compression/movement_sparsity/importance_threshold": -0.10091402088862766, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17573542892932892, + "epoch": 6.61, + "learning_rate": 1.9504229983059294e-06, + "loss": 0.1685, + "step": 18280, + "task_loss": 0.23665164411067963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05761519270342284, + "compression/movement_sparsity/importance_threshold": -0.10059198936651925, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14210020005702972, + "epoch": 6.61, + "learning_rate": 1.949832868121498e-06, + "loss": 0.168, + "step": 18290, + "task_loss": 0.3023013472557068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05763878380672986, + "compression/movement_sparsity/importance_threshold": -0.10027064367424865, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1647496223449707, + "epoch": 6.61, + "learning_rate": 1.9492393367854633e-06, + "loss": 0.1695, + "step": 18300, + "task_loss": 0.5862131118774414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05766232461455803, + "compression/movement_sparsity/importance_threshold": -0.09994998308073144, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18408089876174927, + "epoch": 6.62, + "learning_rate": 1.9486424064231367e-06, + "loss": 0.1789, + "step": 18310, + "task_loss": 0.5712409615516663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05768581518057873, + "compression/movement_sparsity/importance_threshold": -0.09963000685488421, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16597400605678558, + "epoch": 6.62, + "learning_rate": 1.9480420791719995e-06, + "loss": 0.172, + "step": 18320, + "task_loss": 0.297050803899765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05770925555846339, + "compression/movement_sparsity/importance_threshold": -0.09931071426562277, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15080735087394714, + "epoch": 6.62, + "learning_rate": 1.9474383571816978e-06, + "loss": 0.1865, + "step": 18330, + "task_loss": 0.6137457489967346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05773264580188341, + "compression/movement_sparsity/importance_threshold": -0.09899210458186325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13079148530960083, + "epoch": 6.63, + "learning_rate": 1.9468312426140326e-06, + "loss": 0.1616, + "step": 18340, + "task_loss": 0.3845100402832031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057755985964510224, + "compression/movement_sparsity/importance_threshold": -0.09867417707252157, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19262845814228058, + "epoch": 6.63, + "learning_rate": 1.9462207376429537e-06, + "loss": 0.1708, + "step": 18350, + "task_loss": 0.3163360357284546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05777927610001521, + "compression/movement_sparsity/importance_threshold": -0.0983569310065141, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15635263919830322, + "epoch": 6.64, + "learning_rate": 1.9456068444545504e-06, + "loss": 0.1548, + "step": 18360, + "task_loss": 0.3761829733848572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0578025162620698, + "compression/movement_sparsity/importance_threshold": -0.09804036565275664, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14289672672748566, + "epoch": 6.64, + "learning_rate": 1.944989565247046e-06, + "loss": 0.167, + "step": 18370, + "task_loss": 0.3219255805015564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057825706504345396, + "compression/movement_sparsity/importance_threshold": -0.09772448028016545, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1459888368844986, + "epoch": 6.64, + "learning_rate": 1.944368902230786e-06, + "loss": 0.1658, + "step": 18380, + "task_loss": 0.33508729934692383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057848846880513415, + "compression/movement_sparsity/importance_threshold": -0.09740927415765632, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15043103694915771, + "epoch": 6.65, + "learning_rate": 1.943744857628235e-06, + "loss": 0.1693, + "step": 18390, + "task_loss": 0.3376499116420746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057871937444245264, + "compression/movement_sparsity/importance_threshold": -0.09709474655414563, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1579783409833908, + "epoch": 6.65, + "learning_rate": 1.9431174336739656e-06, + "loss": 0.1664, + "step": 18400, + "task_loss": 0.4792447090148926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05789497824921235, + "compression/movement_sparsity/importance_threshold": -0.09678089673854928, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12249864637851715, + "epoch": 6.65, + "learning_rate": 1.9424866326146506e-06, + "loss": 0.1631, + "step": 18410, + "task_loss": 0.4236283302307129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05791796934908609, + "compression/movement_sparsity/importance_threshold": -0.09646772397978332, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15789994597434998, + "epoch": 6.66, + "learning_rate": 1.941852456709056e-06, + "loss": 0.166, + "step": 18420, + "task_loss": 0.3791453540325165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057940910797537895, + "compression/movement_sparsity/importance_threshold": -0.09615522754676387, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11831511557102203, + "epoch": 6.66, + "learning_rate": 1.9412149082280315e-06, + "loss": 0.1675, + "step": 18430, + "task_loss": 0.28735044598579407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057963802648239166, + "compression/movement_sparsity/importance_threshold": -0.09584340670840708, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17035847902297974, + "epoch": 6.66, + "learning_rate": 1.9405739894545044e-06, + "loss": 0.1814, + "step": 18440, + "task_loss": 0.3237995505332947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.057986644954861326, + "compression/movement_sparsity/importance_threshold": -0.09553226073362886, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14559730887413025, + "epoch": 6.67, + "learning_rate": 1.9399297026834707e-06, + "loss": 0.1559, + "step": 18450, + "task_loss": 0.3083050549030304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05800943777107577, + "compression/movement_sparsity/importance_threshold": -0.09522178889134547, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1738000512123108, + "epoch": 6.67, + "learning_rate": 1.939282050221985e-06, + "loss": 0.1549, + "step": 18460, + "task_loss": 0.7337498664855957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058032181150553926, + "compression/movement_sparsity/importance_threshold": -0.0949119904504726, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13351115584373474, + "epoch": 6.68, + "learning_rate": 1.9386310343891546e-06, + "loss": 0.1679, + "step": 18470, + "task_loss": 0.33444127440452576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05805487514696719, + "compression/movement_sparsity/importance_threshold": -0.09460286467992685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15732640027999878, + "epoch": 6.68, + "learning_rate": 1.9379766575161305e-06, + "loss": 0.1589, + "step": 18480, + "task_loss": 0.35811203718185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05807751981398699, + "compression/movement_sparsity/importance_threshold": -0.09429441084862367, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14750367403030396, + "epoch": 6.68, + "learning_rate": 1.937318921946098e-06, + "loss": 0.1596, + "step": 18490, + "task_loss": 0.21471939980983734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05810011520528472, + "compression/movement_sparsity/importance_threshold": -0.09398662822547976, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16651684045791626, + "epoch": 6.69, + "learning_rate": 1.93665783003427e-06, + "loss": 0.167, + "step": 18500, + "task_loss": 0.6148971319198608 + }, + { + "epoch": 6.69, + "eval_exact_match": 83.69914853358561, + "eval_f1": 90.03513064764907, + "step": 18500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058122661374531794, + "compression/movement_sparsity/importance_threshold": -0.09367951607941072, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16546697914600372, + "epoch": 6.69, + "learning_rate": 1.935993384147878e-06, + "loss": 0.1774, + "step": 18510, + "task_loss": 0.5665580630302429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05814515837539962, + "compression/movement_sparsity/importance_threshold": -0.0933730736793329, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14349332451820374, + "epoch": 6.69, + "learning_rate": 1.9353255866661615e-06, + "loss": 0.1656, + "step": 18520, + "task_loss": 0.6033084988594055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058167606261559625, + "compression/movement_sparsity/importance_threshold": -0.09306730029416221, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14493785798549652, + "epoch": 6.7, + "learning_rate": 1.9346544399803647e-06, + "loss": 0.1629, + "step": 18530, + "task_loss": 0.3323573172092438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05819000508668321, + "compression/movement_sparsity/importance_threshold": -0.0927621951928147, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12968416512012482, + "epoch": 6.7, + "learning_rate": 1.933979946493721e-06, + "loss": 0.16, + "step": 18540, + "task_loss": 0.27857789397239685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05821235490444178, + "compression/movement_sparsity/importance_threshold": -0.0924577576442066, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14049646258354187, + "epoch": 6.7, + "learning_rate": 1.93330210862145e-06, + "loss": 0.1607, + "step": 18550, + "task_loss": 0.4084530472755432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05823465576850675, + "compression/movement_sparsity/importance_threshold": -0.09215398691725374, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15300214290618896, + "epoch": 6.71, + "learning_rate": 1.9326209287907472e-06, + "loss": 0.1835, + "step": 18560, + "task_loss": 0.3902439773082733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05825690773254952, + "compression/movement_sparsity/importance_threshold": -0.09185088228087246, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1685321033000946, + "epoch": 6.71, + "learning_rate": 1.9319364094407734e-06, + "loss": 0.1601, + "step": 18570, + "task_loss": 0.27366265654563904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05827911085024152, + "compression/movement_sparsity/importance_threshold": -0.09154844300397869, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18406498432159424, + "epoch": 6.71, + "learning_rate": 1.931248553022649e-06, + "loss": 0.1699, + "step": 18580, + "task_loss": 0.5142942070960999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05830126517525415, + "compression/movement_sparsity/importance_threshold": -0.09124666835548845, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15226216614246368, + "epoch": 6.72, + "learning_rate": 1.9305573619994426e-06, + "loss": 0.1693, + "step": 18590, + "task_loss": 0.43030428886413574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058323370761258826, + "compression/movement_sparsity/importance_threshold": -0.09094555760431777, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14825284481048584, + "epoch": 6.72, + "learning_rate": 1.929862838846164e-06, + "loss": 0.1628, + "step": 18600, + "task_loss": 0.4481026530265808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05834542766192694, + "compression/movement_sparsity/importance_threshold": -0.09064511001938291, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1773342490196228, + "epoch": 6.73, + "learning_rate": 1.929164986049754e-06, + "loss": 0.1774, + "step": 18610, + "task_loss": 0.4238778352737427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05836743593092993, + "compression/movement_sparsity/importance_threshold": -0.09034532486959979, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16140668094158173, + "epoch": 6.73, + "learning_rate": 1.928463806109077e-06, + "loss": 0.1705, + "step": 18620, + "task_loss": 0.650934100151062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05838939562193919, + "compression/movement_sparsity/importance_threshold": -0.09004620142388442, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17616376280784607, + "epoch": 6.73, + "learning_rate": 1.9277593015349107e-06, + "loss": 0.1894, + "step": 18630, + "task_loss": 0.4096485376358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05841130678862614, + "compression/movement_sparsity/importance_threshold": -0.08974773895115307, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1400645673274994, + "epoch": 6.74, + "learning_rate": 1.927051474849938e-06, + "loss": 0.1758, + "step": 18640, + "task_loss": 0.45845314860343933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05843316948466218, + "compression/movement_sparsity/importance_threshold": -0.08944993672032164, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16155488789081573, + "epoch": 6.74, + "learning_rate": 1.926340328588737e-06, + "loss": 0.1759, + "step": 18650, + "task_loss": 0.44465121626853943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05845498376371872, + "compression/movement_sparsity/importance_threshold": -0.08915279400030629, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.21582606434822083, + "epoch": 6.74, + "learning_rate": 1.9256258652977727e-06, + "loss": 0.1758, + "step": 18660, + "task_loss": 0.5472663640975952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05847674967946718, + "compression/movement_sparsity/importance_threshold": -0.08885631006002304, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13278156518936157, + "epoch": 6.75, + "learning_rate": 1.924908087535388e-06, + "loss": 0.1683, + "step": 18670, + "task_loss": 0.3198769688606262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05849846728557897, + "compression/movement_sparsity/importance_threshold": -0.08856048416838802, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16864389181137085, + "epoch": 6.75, + "learning_rate": 1.924186997871794e-06, + "loss": 0.1601, + "step": 18680, + "task_loss": 0.4776211082935333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0585201366357255, + "compression/movement_sparsity/importance_threshold": -0.08826531559431705, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13846157491207123, + "epoch": 6.75, + "learning_rate": 1.9234625988890605e-06, + "loss": 0.1714, + "step": 18690, + "task_loss": 0.5768707990646362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05854175778357817, + "compression/movement_sparsity/importance_threshold": -0.0879708036067266, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18022726476192474, + "epoch": 6.76, + "learning_rate": 1.9227348931811093e-06, + "loss": 0.165, + "step": 18700, + "task_loss": 0.46484941244125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0585633307828084, + "compression/movement_sparsity/importance_threshold": -0.08767694747453247, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1907450407743454, + "epoch": 6.76, + "learning_rate": 1.922003883353699e-06, + "loss": 0.1671, + "step": 18710, + "task_loss": 0.5632059574127197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05858485568708759, + "compression/movement_sparsity/importance_threshold": -0.08738374646665081, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15245582163333893, + "epoch": 6.77, + "learning_rate": 1.9212695720244245e-06, + "loss": 0.1807, + "step": 18720, + "task_loss": 0.5414676070213318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058606332550087174, + "compression/movement_sparsity/importance_threshold": -0.08709119985199754, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1406639814376831, + "epoch": 6.77, + "learning_rate": 1.9205319618226984e-06, + "loss": 0.1676, + "step": 18730, + "task_loss": 0.3787824213504791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05862776142547855, + "compression/movement_sparsity/importance_threshold": -0.08679930689948889, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1576058268547058, + "epoch": 6.77, + "learning_rate": 1.919791055389748e-06, + "loss": 0.1666, + "step": 18740, + "task_loss": 0.5415380001068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05864914236693311, + "compression/movement_sparsity/importance_threshold": -0.08650806687804102, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1747387945652008, + "epoch": 6.78, + "learning_rate": 1.919046855378604e-06, + "loss": 0.1728, + "step": 18750, + "task_loss": 0.3638462424278259 + }, + { + "epoch": 6.78, + "eval_exact_match": 83.69914853358561, + "eval_f1": 90.11730656848496, + "step": 18750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058670475428122294, + "compression/movement_sparsity/importance_threshold": -0.08621747905656973, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14378315210342407, + "epoch": 6.78, + "learning_rate": 1.918299364454089e-06, + "loss": 0.1612, + "step": 18760, + "task_loss": 0.7437037825584412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058691760662717495, + "compression/movement_sparsity/importance_threshold": -0.08592754270399128, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16446062922477722, + "epoch": 6.78, + "learning_rate": 1.917548585292811e-06, + "loss": 0.1757, + "step": 18770, + "task_loss": 0.5021160840988159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05871299812439014, + "compression/movement_sparsity/importance_threshold": -0.08563825708922157, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14091575145721436, + "epoch": 6.79, + "learning_rate": 1.9167945205831526e-06, + "loss": 0.177, + "step": 18780, + "task_loss": 0.618760883808136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05873418786681161, + "compression/movement_sparsity/importance_threshold": -0.08534962148117686, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13678541779518127, + "epoch": 6.79, + "learning_rate": 1.9160371730252607e-06, + "loss": 0.1767, + "step": 18790, + "task_loss": 0.43866440653800964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058755329943653345, + "compression/movement_sparsity/importance_threshold": -0.08506163514877318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17583860456943512, + "epoch": 6.79, + "learning_rate": 1.9152765453310366e-06, + "loss": 0.1779, + "step": 18800, + "task_loss": 0.7005617618560791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05877642440858674, + "compression/movement_sparsity/importance_threshold": -0.08477429736092645, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14417672157287598, + "epoch": 6.8, + "learning_rate": 1.9145126402241293e-06, + "loss": 0.1687, + "step": 18810, + "task_loss": 0.5040542483329773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05879747131528321, + "compression/movement_sparsity/importance_threshold": -0.08448760738655292, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17511731386184692, + "epoch": 6.8, + "learning_rate": 1.9137454604399215e-06, + "loss": 0.1786, + "step": 18820, + "task_loss": 0.6716142892837524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05881847071741417, + "compression/movement_sparsity/importance_threshold": -0.0842015644945685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16290408372879028, + "epoch": 6.81, + "learning_rate": 1.9129750087255232e-06, + "loss": 0.1747, + "step": 18830, + "task_loss": 0.4927728772163391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05883942266865102, + "compression/movement_sparsity/importance_threshold": -0.08391616795388934, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16758540272712708, + "epoch": 6.81, + "learning_rate": 1.9122012878397593e-06, + "loss": 0.1568, + "step": 18840, + "task_loss": 0.3878645896911621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058860327222665175, + "compression/movement_sparsity/importance_threshold": -0.08363141703343147, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14906227588653564, + "epoch": 6.81, + "learning_rate": 1.911424300553161e-06, + "loss": 0.1536, + "step": 18850, + "task_loss": 0.20304948091506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05888118443312805, + "compression/movement_sparsity/importance_threshold": -0.08334731100211101, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1479981690645218, + "epoch": 6.82, + "learning_rate": 1.9106440496479573e-06, + "loss": 0.1643, + "step": 18860, + "task_loss": 0.6089077591896057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05890199435371106, + "compression/movement_sparsity/importance_threshold": -0.08306384912884401, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.143125981092453, + "epoch": 6.82, + "learning_rate": 1.9098605379180613e-06, + "loss": 0.1653, + "step": 18870, + "task_loss": 0.32572343945503235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058922757038085596, + "compression/movement_sparsity/importance_threshold": -0.08278103068254661, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13448187708854675, + "epoch": 6.82, + "learning_rate": 1.909073768169065e-06, + "loss": 0.1586, + "step": 18880, + "task_loss": 0.4831852912902832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058943472539923096, + "compression/movement_sparsity/importance_threshold": -0.0824988549321346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14394140243530273, + "epoch": 6.83, + "learning_rate": 1.908283743218224e-06, + "loss": 0.1585, + "step": 18890, + "task_loss": 0.41934144496917725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05896414091289494, + "compression/movement_sparsity/importance_threshold": -0.08221732114652447, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14577503502368927, + "epoch": 6.83, + "learning_rate": 1.9074904658944524e-06, + "loss": 0.1639, + "step": 18900, + "task_loss": 0.29828184843063354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058984762210672566, + "compression/movement_sparsity/importance_threshold": -0.0819364285946318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1624283492565155, + "epoch": 6.83, + "learning_rate": 1.9066939390383086e-06, + "loss": 0.1667, + "step": 18910, + "task_loss": 0.37106114625930786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059005336486927365, + "compression/movement_sparsity/importance_threshold": -0.08165617654537305, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1468050479888916, + "epoch": 6.84, + "learning_rate": 1.905894165501988e-06, + "loss": 0.163, + "step": 18920, + "task_loss": 0.7496539354324341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05902586379533076, + "compression/movement_sparsity/importance_threshold": -0.08137656426766415, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15505670011043549, + "epoch": 6.84, + "learning_rate": 1.9050911481493112e-06, + "loss": 0.163, + "step": 18930, + "task_loss": 0.45304474234580994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05904634418955415, + "compression/movement_sparsity/importance_threshold": -0.08109759103042113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17738257348537445, + "epoch": 6.84, + "learning_rate": 1.9042848898557145e-06, + "loss": 0.1714, + "step": 18940, + "task_loss": 0.4849020838737488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05906677772326896, + "compression/movement_sparsity/importance_threshold": -0.08081925610256013, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12414862960577011, + "epoch": 6.85, + "learning_rate": 1.903475393508239e-06, + "loss": 0.1536, + "step": 18950, + "task_loss": 0.5212274789810181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05908716445014659, + "compression/movement_sparsity/importance_threshold": -0.08054155875299718, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16516438126564026, + "epoch": 6.85, + "learning_rate": 1.9026626620055208e-06, + "loss": 0.1734, + "step": 18960, + "task_loss": 0.3675074577331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059107504423858453, + "compression/movement_sparsity/importance_threshold": -0.0802644982506483, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13828526437282562, + "epoch": 6.86, + "learning_rate": 1.9018466982577802e-06, + "loss": 0.1617, + "step": 18970, + "task_loss": 0.44366180896759033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05912779769807596, + "compression/movement_sparsity/importance_threshold": -0.07998807386442963, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1504247784614563, + "epoch": 6.86, + "learning_rate": 1.9010275051868123e-06, + "loss": 0.1575, + "step": 18980, + "task_loss": 0.5267570614814758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05914804432647053, + "compression/movement_sparsity/importance_threshold": -0.07971228486325721, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1422831118106842, + "epoch": 6.86, + "learning_rate": 1.9002050857259743e-06, + "loss": 0.1798, + "step": 18990, + "task_loss": 0.38829371333122253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059168244362713555, + "compression/movement_sparsity/importance_threshold": -0.07943713051604717, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14333003759384155, + "epoch": 6.87, + "learning_rate": 1.8993794428201775e-06, + "loss": 0.1553, + "step": 19000, + "task_loss": 0.5657933950424194 + }, + { + "epoch": 6.87, + "eval_exact_match": 83.60454115421003, + "eval_f1": 89.96767923298626, + "step": 19000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059188397860476465, + "compression/movement_sparsity/importance_threshold": -0.07916261009171532, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16756048798561096, + "epoch": 6.87, + "learning_rate": 1.8985505794258754e-06, + "loss": 0.1683, + "step": 19010, + "task_loss": 0.4580082893371582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059208504873430656, + "compression/movement_sparsity/importance_threshold": -0.07888872285917803, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13767385482788086, + "epoch": 6.87, + "learning_rate": 1.8977184985110535e-06, + "loss": 0.1644, + "step": 19020, + "task_loss": 0.33297887444496155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059228565455247545, + "compression/movement_sparsity/importance_threshold": -0.07861546808735143, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1662845015525818, + "epoch": 6.88, + "learning_rate": 1.8968832030552182e-06, + "loss": 0.1624, + "step": 19030, + "task_loss": 0.7603532671928406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05924857965959853, + "compression/movement_sparsity/importance_threshold": -0.07834284504515132, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14940595626831055, + "epoch": 6.88, + "learning_rate": 1.8960446960493872e-06, + "loss": 0.168, + "step": 19040, + "task_loss": 0.357892781496048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05926854754015506, + "compression/movement_sparsity/importance_threshold": -0.07807085300149363, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1585623174905777, + "epoch": 6.88, + "learning_rate": 1.895202980496077e-06, + "loss": 0.172, + "step": 19050, + "task_loss": 0.2738313674926758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059288469150588496, + "compression/movement_sparsity/importance_threshold": -0.07779949122529495, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14924949407577515, + "epoch": 6.89, + "learning_rate": 1.8943580594092942e-06, + "loss": 0.1706, + "step": 19060, + "task_loss": 0.4149706959724426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059308344544570284, + "compression/movement_sparsity/importance_threshold": -0.07752875898547096, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12119010835886002, + "epoch": 6.89, + "learning_rate": 1.8935099358145233e-06, + "loss": 0.1463, + "step": 19070, + "task_loss": 0.31337571144104004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05932817377577181, + "compression/movement_sparsity/importance_threshold": -0.0772586555509378, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13046619296073914, + "epoch": 6.9, + "learning_rate": 1.8926586127487165e-06, + "loss": 0.1732, + "step": 19080, + "task_loss": 0.577675461769104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05934795689786451, + "compression/movement_sparsity/importance_threshold": -0.07698918019061152, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13301792740821838, + "epoch": 6.9, + "learning_rate": 1.8918040932602822e-06, + "loss": 0.1594, + "step": 19090, + "task_loss": 0.38307079672813416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059367693964519776, + "compression/movement_sparsity/importance_threshold": -0.07672033217340835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15103420615196228, + "epoch": 6.9, + "learning_rate": 1.8909463804090753e-06, + "loss": 0.1592, + "step": 19100, + "task_loss": 0.5020593404769897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059387385029409025, + "compression/movement_sparsity/importance_threshold": -0.0764521107682441, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14066827297210693, + "epoch": 6.91, + "learning_rate": 1.890085477266385e-06, + "loss": 0.1688, + "step": 19110, + "task_loss": 0.2139877825975418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05940703014620367, + "compression/movement_sparsity/importance_threshold": -0.07618451524403502, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1647796630859375, + "epoch": 6.91, + "learning_rate": 1.8892213869149238e-06, + "loss": 0.1717, + "step": 19120, + "task_loss": 0.4196796417236328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05942662936857511, + "compression/movement_sparsity/importance_threshold": -0.07591754486969715, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17903751134872437, + "epoch": 6.91, + "learning_rate": 1.8883541124488178e-06, + "loss": 0.1747, + "step": 19130, + "task_loss": 0.399991512298584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059446182750194775, + "compression/movement_sparsity/importance_threshold": -0.07565119891414651, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13912296295166016, + "epoch": 6.92, + "learning_rate": 1.8874836569735942e-06, + "loss": 0.1705, + "step": 19140, + "task_loss": 0.6287709474563599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05946569034473405, + "compression/movement_sparsity/importance_threshold": -0.07538547664629924, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14840209484100342, + "epoch": 6.92, + "learning_rate": 1.8866100236061708e-06, + "loss": 0.1693, + "step": 19150, + "task_loss": 0.45292603969573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05948515220586437, + "compression/movement_sparsity/importance_threshold": -0.07512037733507126, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1409136801958084, + "epoch": 6.92, + "learning_rate": 1.885733215474845e-06, + "loss": 0.1563, + "step": 19160, + "task_loss": 0.23241209983825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05950456838725712, + "compression/movement_sparsity/importance_threshold": -0.07485590024937894, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2178533971309662, + "epoch": 6.93, + "learning_rate": 1.8848532357192824e-06, + "loss": 0.1765, + "step": 19170, + "task_loss": 0.4487258791923523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059523938942583746, + "compression/movement_sparsity/importance_threshold": -0.07459204465813785, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13762247562408447, + "epoch": 6.93, + "learning_rate": 1.8839700874905046e-06, + "loss": 0.169, + "step": 19180, + "task_loss": 0.3723566234111786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059543263925515635, + "compression/movement_sparsity/importance_threshold": -0.07432880983026457, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17009657621383667, + "epoch": 6.94, + "learning_rate": 1.8830837739508802e-06, + "loss": 0.1631, + "step": 19190, + "task_loss": 0.23757542669773102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05956254338972419, + "compression/movement_sparsity/importance_threshold": -0.07406619503467493, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19928023219108582, + "epoch": 6.94, + "learning_rate": 1.8821942982741113e-06, + "loss": 0.1832, + "step": 19200, + "task_loss": 0.39194488525390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05958177738888084, + "compression/movement_sparsity/importance_threshold": -0.07380419954028494, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13645857572555542, + "epoch": 6.94, + "learning_rate": 1.8813016636452228e-06, + "loss": 0.155, + "step": 19210, + "task_loss": 0.3137187361717224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059600965976657, + "compression/movement_sparsity/importance_threshold": -0.07354282261601064, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1318303644657135, + "epoch": 6.95, + "learning_rate": 1.8804058732605516e-06, + "loss": 0.1635, + "step": 19220, + "task_loss": 0.3633446991443634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05962010920672406, + "compression/movement_sparsity/importance_threshold": -0.07328206353076827, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1735188364982605, + "epoch": 6.95, + "learning_rate": 1.879506930327735e-06, + "loss": 0.1601, + "step": 19230, + "task_loss": 0.5617966651916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05963920713275343, + "compression/movement_sparsity/importance_threshold": -0.07302192155347387, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1602572202682495, + "epoch": 6.95, + "learning_rate": 1.8786048380656979e-06, + "loss": 0.1736, + "step": 19240, + "task_loss": 0.35640949010849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05965825980841654, + "compression/movement_sparsity/importance_threshold": -0.07276239595304346, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17654402554035187, + "epoch": 6.96, + "learning_rate": 1.8776995997046424e-06, + "loss": 0.1577, + "step": 19250, + "task_loss": 0.3391542434692383 + }, + { + "epoch": 6.96, + "eval_exact_match": 83.50047303689688, + "eval_f1": 89.92843125598556, + "step": 19250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05967726728738478, + "compression/movement_sparsity/importance_threshold": -0.07250348599839318, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12063628435134888, + "epoch": 6.96, + "learning_rate": 1.876791218486038e-06, + "loss": 0.1641, + "step": 19260, + "task_loss": 0.405428946018219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059696229623329586, + "compression/movement_sparsity/importance_threshold": -0.07224519095843895, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.148264080286026, + "epoch": 6.96, + "learning_rate": 1.8758796976626056e-06, + "loss": 0.1597, + "step": 19270, + "task_loss": 0.3504221439361572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05971514686992235, + "compression/movement_sparsity/importance_threshold": -0.07198751010209681, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18574409186840057, + "epoch": 6.97, + "learning_rate": 1.8749650404983096e-06, + "loss": 0.1745, + "step": 19280, + "task_loss": 0.5480473041534424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05973401908083449, + "compression/movement_sparsity/importance_threshold": -0.07173044269828299, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1708531677722931, + "epoch": 6.97, + "learning_rate": 1.8740472502683445e-06, + "loss": 0.1658, + "step": 19290, + "task_loss": 0.329237699508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05975284630973741, + "compression/movement_sparsity/importance_threshold": -0.07147398801591365, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14285686612129211, + "epoch": 6.98, + "learning_rate": 1.8731263302591249e-06, + "loss": 0.1752, + "step": 19300, + "task_loss": 0.46866947412490845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05977162861030252, + "compression/movement_sparsity/importance_threshold": -0.07121814532390458, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.149366557598114, + "epoch": 6.98, + "learning_rate": 1.8722022837682707e-06, + "loss": 0.1703, + "step": 19310, + "task_loss": 0.5933884382247925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059790366036201234, + "compression/movement_sparsity/importance_threshold": -0.07096291389117193, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16207867860794067, + "epoch": 6.98, + "learning_rate": 1.8712751141045983e-06, + "loss": 0.1811, + "step": 19320, + "task_loss": 0.36161190271377563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05980905864110496, + "compression/movement_sparsity/importance_threshold": -0.07070829298663195, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17707979679107666, + "epoch": 6.99, + "learning_rate": 1.8703448245881071e-06, + "loss": 0.1718, + "step": 19330, + "task_loss": 0.3630777597427368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05982770647868512, + "compression/movement_sparsity/importance_threshold": -0.07045428187920033, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14741402864456177, + "epoch": 6.99, + "learning_rate": 1.8694114185499679e-06, + "loss": 0.172, + "step": 19340, + "task_loss": 0.31712833046913147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05984630960261311, + "compression/movement_sparsity/importance_threshold": -0.07020087983779355, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13535742461681366, + "epoch": 6.99, + "learning_rate": 1.8684748993325111e-06, + "loss": 0.1702, + "step": 19350, + "task_loss": 0.33675703406333923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05986486806656035, + "compression/movement_sparsity/importance_threshold": -0.06994808613132752, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16717709600925446, + "epoch": 7.0, + "learning_rate": 1.8675352702892155e-06, + "loss": 0.1711, + "step": 19360, + "task_loss": 0.6665349006652832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05988338192419825, + "compression/movement_sparsity/importance_threshold": -0.06969590002871806, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17918677628040314, + "epoch": 7.0, + "learning_rate": 1.866592534784695e-06, + "loss": 0.1832, + "step": 19370, + "task_loss": 0.6708546876907349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059901851229198215, + "compression/movement_sparsity/importance_threshold": -0.06944432079888163, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14213892817497253, + "epoch": 7.0, + "learning_rate": 1.8656466961946862e-06, + "loss": 0.169, + "step": 19380, + "task_loss": 0.372379332780838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05992027603523166, + "compression/movement_sparsity/importance_threshold": -0.06919334771073415, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17070825397968292, + "epoch": 7.01, + "learning_rate": 1.8646977579060389e-06, + "loss": 0.1609, + "step": 19390, + "task_loss": 0.3748677968978882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05993865639596999, + "compression/movement_sparsity/importance_threshold": -0.06894298003319155, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2126743048429489, + "epoch": 7.01, + "learning_rate": 1.8637457233167005e-06, + "loss": 0.1668, + "step": 19400, + "task_loss": 0.5745400190353394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05995699236508463, + "compression/movement_sparsity/importance_threshold": -0.06869321703517006, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.130209818482399, + "epoch": 7.01, + "learning_rate": 1.8627905958357073e-06, + "loss": 0.1665, + "step": 19410, + "task_loss": 0.34854692220687866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05997528399624697, + "compression/movement_sparsity/importance_threshold": -0.06844405798558584, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15571898221969604, + "epoch": 7.02, + "learning_rate": 1.8618323788831697e-06, + "loss": 0.1792, + "step": 19420, + "task_loss": 0.6182477474212646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05999353134312843, + "compression/movement_sparsity/importance_threshold": -0.06819550215335468, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1204761266708374, + "epoch": 7.02, + "learning_rate": 1.8608710758902607e-06, + "loss": 0.1644, + "step": 19430, + "task_loss": 0.415132999420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06001173445940043, + "compression/movement_sparsity/importance_threshold": -0.06794754880739273, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18349982798099518, + "epoch": 7.03, + "learning_rate": 1.859906690299204e-06, + "loss": 0.1842, + "step": 19440, + "task_loss": 0.5370660424232483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06002989339873437, + "compression/movement_sparsity/importance_threshold": -0.06770019721661613, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1359555423259735, + "epoch": 7.03, + "learning_rate": 1.8589392255632617e-06, + "loss": 0.1724, + "step": 19450, + "task_loss": 0.41707509756088257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060048008214801656, + "compression/movement_sparsity/importance_threshold": -0.06745344664994113, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15165475010871887, + "epoch": 7.03, + "learning_rate": 1.857968685146721e-06, + "loss": 0.1643, + "step": 19460, + "task_loss": 0.43051207065582275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06006607896127371, + "compression/movement_sparsity/importance_threshold": -0.06720729637628331, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1730663776397705, + "epoch": 7.04, + "learning_rate": 1.8569950725248831e-06, + "loss": 0.1654, + "step": 19470, + "task_loss": 0.5514428615570068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06008410569182194, + "compression/movement_sparsity/importance_threshold": -0.06696174566455915, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15489190816879272, + "epoch": 7.04, + "learning_rate": 1.85601839118405e-06, + "loss": 0.1711, + "step": 19480, + "task_loss": 0.3652356266975403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06010208846011775, + "compression/movement_sparsity/importance_threshold": -0.06671679378368456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12107902020215988, + "epoch": 7.04, + "learning_rate": 1.8550386446215121e-06, + "loss": 0.1532, + "step": 19490, + "task_loss": 0.49454283714294434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060120027319832556, + "compression/movement_sparsity/importance_threshold": -0.06647244000257568, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1640836000442505, + "epoch": 7.05, + "learning_rate": 1.8540558363455353e-06, + "loss": 0.1532, + "step": 19500, + "task_loss": 0.3784339427947998 + }, + { + "epoch": 7.05, + "eval_exact_match": 83.57615894039735, + "eval_f1": 89.98257156583203, + "step": 19500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06013792232463777, + "compression/movement_sparsity/importance_threshold": -0.06622868359014844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1305304914712906, + "epoch": 7.05, + "learning_rate": 1.8530699698753494e-06, + "loss": 0.1526, + "step": 19510, + "task_loss": 0.4855495095252991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0601557735282048, + "compression/movement_sparsity/importance_threshold": -0.06598552381531908, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.121201291680336, + "epoch": 7.05, + "learning_rate": 1.8520810487411347e-06, + "loss": 0.1668, + "step": 19520, + "task_loss": 0.5691750049591064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060173580984205055, + "compression/movement_sparsity/importance_threshold": -0.06574295994700352, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14304503798484802, + "epoch": 7.06, + "learning_rate": 1.8510890764840098e-06, + "loss": 0.1623, + "step": 19530, + "task_loss": 0.4884259104728699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060191344746309945, + "compression/movement_sparsity/importance_threshold": -0.06550099125411801, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1405264437198639, + "epoch": 7.06, + "learning_rate": 1.8500940566560187e-06, + "loss": 0.1557, + "step": 19540, + "task_loss": 0.7580517530441284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060209064868190894, + "compression/movement_sparsity/importance_threshold": -0.06525961700557836, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14326691627502441, + "epoch": 7.07, + "learning_rate": 1.8490959928201173e-06, + "loss": 0.1639, + "step": 19550, + "task_loss": 0.7660905122756958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06022674140351929, + "compression/movement_sparsity/importance_threshold": -0.06501883647030082, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13954105973243713, + "epoch": 7.07, + "learning_rate": 1.8480948885501627e-06, + "loss": 0.1734, + "step": 19560, + "task_loss": 0.5161733627319336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06024437440596656, + "compression/movement_sparsity/importance_threshold": -0.06477864891720131, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1333414614200592, + "epoch": 7.07, + "learning_rate": 1.847090747430899e-06, + "loss": 0.1631, + "step": 19570, + "task_loss": 0.24418434500694275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0602619639292041, + "compression/movement_sparsity/importance_threshold": -0.0645390536151963, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13161346316337585, + "epoch": 7.08, + "learning_rate": 1.8460835730579434e-06, + "loss": 0.1733, + "step": 19580, + "task_loss": 0.5065572261810303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06027951002690334, + "compression/movement_sparsity/importance_threshold": -0.06430004983320126, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12809817492961884, + "epoch": 7.08, + "learning_rate": 1.8450733690377757e-06, + "loss": 0.1826, + "step": 19590, + "task_loss": 0.36705049872398376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06029701275273568, + "compression/movement_sparsity/importance_threshold": -0.06406163684013266, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1483440101146698, + "epoch": 7.08, + "learning_rate": 1.8440601389877241e-06, + "loss": 0.1648, + "step": 19600, + "task_loss": 0.2816522419452667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060314472160372536, + "compression/movement_sparsity/importance_threshold": -0.06382381390490643, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16983579099178314, + "epoch": 7.09, + "learning_rate": 1.843043886535952e-06, + "loss": 0.1687, + "step": 19610, + "task_loss": 0.5111091136932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060331888303485307, + "compression/movement_sparsity/importance_threshold": -0.06358658029643882, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1196693629026413, + "epoch": 7.09, + "learning_rate": 1.8420246153214451e-06, + "loss": 0.154, + "step": 19620, + "task_loss": 0.24485935270786285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06034926123574542, + "compression/movement_sparsity/importance_threshold": -0.06334993528364552, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11787790060043335, + "epoch": 7.09, + "learning_rate": 1.841002328994e-06, + "loss": 0.1685, + "step": 19630, + "task_loss": 0.6096109747886658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06036659101082427, + "compression/movement_sparsity/importance_threshold": -0.06311387813544289, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1214839369058609, + "epoch": 7.1, + "learning_rate": 1.8399770312142082e-06, + "loss": 0.1656, + "step": 19640, + "task_loss": 0.24309919774532318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06038387768239327, + "compression/movement_sparsity/importance_threshold": -0.06287840812074696, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12063048779964447, + "epoch": 7.1, + "learning_rate": 1.8389487256534456e-06, + "loss": 0.163, + "step": 19650, + "task_loss": 0.29860949516296387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06040112130412384, + "compression/movement_sparsity/importance_threshold": -0.06264352450847377, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18652105331420898, + "epoch": 7.11, + "learning_rate": 1.8379174159938578e-06, + "loss": 0.1688, + "step": 19660, + "task_loss": 0.5233631134033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06041832192968739, + "compression/movement_sparsity/importance_threshold": -0.06240922656753922, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1551060527563095, + "epoch": 7.11, + "learning_rate": 1.8368831059283476e-06, + "loss": 0.1645, + "step": 19670, + "task_loss": 0.529505729675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060435479612755305, + "compression/movement_sparsity/importance_threshold": -0.0621755135668598, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15519507229328156, + "epoch": 7.11, + "learning_rate": 1.835845799160562e-06, + "loss": 0.1601, + "step": 19680, + "task_loss": 0.6162427067756653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06045259440699903, + "compression/movement_sparsity/importance_threshold": -0.06194238477535119, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1276751011610031, + "epoch": 7.12, + "learning_rate": 1.8348054994048783e-06, + "loss": 0.1812, + "step": 19690, + "task_loss": 0.36186686158180237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06046966636608997, + "compression/movement_sparsity/importance_threshold": -0.06170983946192954, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1895449459552765, + "epoch": 7.12, + "learning_rate": 1.8337622103863906e-06, + "loss": 0.1648, + "step": 19700, + "task_loss": 0.6113171577453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06048669554369951, + "compression/movement_sparsity/importance_threshold": -0.0614778768955111, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12751251459121704, + "epoch": 7.12, + "learning_rate": 1.832715935840897e-06, + "loss": 0.1482, + "step": 19710, + "task_loss": 0.271990031003952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06050368199349909, + "compression/movement_sparsity/importance_threshold": -0.06124649634501167, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1760406345129013, + "epoch": 7.13, + "learning_rate": 1.8316666795148873e-06, + "loss": 0.175, + "step": 19720, + "task_loss": 0.5232373476028442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06052062576916011, + "compression/movement_sparsity/importance_threshold": -0.0610156970793474, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1406281739473343, + "epoch": 7.13, + "learning_rate": 1.8306144451655273e-06, + "loss": 0.1719, + "step": 19730, + "task_loss": 0.2717032730579376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06053752692435397, + "compression/movement_sparsity/importance_threshold": -0.06078547836743464, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14542102813720703, + "epoch": 7.13, + "learning_rate": 1.8295592365606462e-06, + "loss": 0.1952, + "step": 19740, + "task_loss": 0.4864187240600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0605543855127521, + "compression/movement_sparsity/importance_threshold": -0.06055583947818899, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15917694568634033, + "epoch": 7.14, + "learning_rate": 1.8285010574787249e-06, + "loss": 0.1618, + "step": 19750, + "task_loss": 0.5574923753738403 + }, + { + "epoch": 7.14, + "eval_exact_match": 83.54777672658467, + "eval_f1": 90.08027567236861, + "step": 19750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0605712015880259, + "compression/movement_sparsity/importance_threshold": -0.06032677968052669, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19061726331710815, + "epoch": 7.14, + "learning_rate": 1.827439911708879e-06, + "loss": 0.1815, + "step": 19760, + "task_loss": 0.5931074023246765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060587975203846775, + "compression/movement_sparsity/importance_threshold": -0.06009829824336399, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14040836691856384, + "epoch": 7.14, + "learning_rate": 1.8263758030508489e-06, + "loss": 0.173, + "step": 19770, + "task_loss": 0.8596678972244263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06060470641388614, + "compression/movement_sparsity/importance_threshold": -0.059870394435616814, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1635446846485138, + "epoch": 7.15, + "learning_rate": 1.8253087353149833e-06, + "loss": 0.1812, + "step": 19780, + "task_loss": 0.29845842719078064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06062139527181541, + "compression/movement_sparsity/importance_threshold": -0.05964306752620119, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1550963819026947, + "epoch": 7.15, + "learning_rate": 1.8242387123222275e-06, + "loss": 0.1731, + "step": 19790, + "task_loss": 0.542272686958313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060638041831306, + "compression/movement_sparsity/importance_threshold": -0.05941631678403325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12241744995117188, + "epoch": 7.16, + "learning_rate": 1.8231657379041089e-06, + "loss": 0.1505, + "step": 19800, + "task_loss": 0.41791096329689026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060654646146029306, + "compression/movement_sparsity/importance_threshold": -0.059190141478029146, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12771350145339966, + "epoch": 7.16, + "learning_rate": 1.8220898159027223e-06, + "loss": 0.1572, + "step": 19810, + "task_loss": 0.4617471396923065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060671208269656746, + "compression/movement_sparsity/importance_threshold": -0.05896454087710479, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1922406703233719, + "epoch": 7.16, + "learning_rate": 1.8210109501707184e-06, + "loss": 0.1773, + "step": 19820, + "task_loss": 0.39758336544036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06068772825585973, + "compression/movement_sparsity/importance_threshold": -0.058739514250176206, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1719903200864792, + "epoch": 7.17, + "learning_rate": 1.8199291445712883e-06, + "loss": 0.1861, + "step": 19830, + "task_loss": 0.5382665395736694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060704206158309675, + "compression/movement_sparsity/importance_threshold": -0.05851506086615965, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14048738777637482, + "epoch": 7.17, + "learning_rate": 1.8188444029781502e-06, + "loss": 0.1595, + "step": 19840, + "task_loss": 0.169685959815979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06072064203067798, + "compression/movement_sparsity/importance_threshold": -0.05829117999397104, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1779119223356247, + "epoch": 7.17, + "learning_rate": 1.8177567292755352e-06, + "loss": 0.167, + "step": 19850, + "task_loss": 0.4589795470237732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06073703592663606, + "compression/movement_sparsity/importance_threshold": -0.05806787090252663, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15580281615257263, + "epoch": 7.18, + "learning_rate": 1.8166661273581744e-06, + "loss": 0.1726, + "step": 19860, + "task_loss": 0.2717253267765045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06075338789985533, + "compression/movement_sparsity/importance_threshold": -0.05784513286074233, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17201679944992065, + "epoch": 7.18, + "learning_rate": 1.8155726011312838e-06, + "loss": 0.1703, + "step": 19870, + "task_loss": 0.46242088079452515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060769698004007194, + "compression/movement_sparsity/importance_threshold": -0.05762296513753418, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14266745746135712, + "epoch": 7.18, + "learning_rate": 1.8144761545105498e-06, + "loss": 0.16, + "step": 19880, + "task_loss": 0.3505728542804718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060785966292763075, + "compression/movement_sparsity/importance_threshold": -0.05740136700181819, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19506075978279114, + "epoch": 7.19, + "learning_rate": 1.8133767914221179e-06, + "loss": 0.159, + "step": 19890, + "task_loss": 0.731766939163208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06080219281979437, + "compression/movement_sparsity/importance_threshold": -0.05718033772251074, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14748713374137878, + "epoch": 7.19, + "learning_rate": 1.8122745158025756e-06, + "loss": 0.1759, + "step": 19900, + "task_loss": 0.5381277203559875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060818377638772486, + "compression/movement_sparsity/importance_threshold": -0.056959876568527634, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1392291784286499, + "epoch": 7.2, + "learning_rate": 1.81116933159894e-06, + "loss": 0.1629, + "step": 19910, + "task_loss": 0.3705242872238159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060834520803368854, + "compression/movement_sparsity/importance_threshold": -0.056739982808785006, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13060082495212555, + "epoch": 7.2, + "learning_rate": 1.810061242768643e-06, + "loss": 0.1549, + "step": 19920, + "task_loss": 0.4211186468601227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06085062236725487, + "compression/movement_sparsity/importance_threshold": -0.056520655712198775, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12718503177165985, + "epoch": 7.2, + "learning_rate": 1.8089502532795175e-06, + "loss": 0.1536, + "step": 19930, + "task_loss": 0.36818772554397583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060866682384101936, + "compression/movement_sparsity/importance_threshold": -0.056301894547685305, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14205113053321838, + "epoch": 7.21, + "learning_rate": 1.807836367109783e-06, + "loss": 0.1711, + "step": 19940, + "task_loss": 0.579256534576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06088270090758148, + "compression/movement_sparsity/importance_threshold": -0.05608369858416051, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1530269980430603, + "epoch": 7.21, + "learning_rate": 1.8067195882480321e-06, + "loss": 0.1737, + "step": 19950, + "task_loss": 0.43539872765541077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06089867799136491, + "compression/movement_sparsity/importance_threshold": -0.05586606709054043, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18445491790771484, + "epoch": 7.21, + "learning_rate": 1.805599920693214e-06, + "loss": 0.1709, + "step": 19960, + "task_loss": 0.651823103427887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06091461368912364, + "compression/movement_sparsity/importance_threshold": -0.05564899933574119, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12383359670639038, + "epoch": 7.22, + "learning_rate": 1.8044773684546228e-06, + "loss": 0.1732, + "step": 19970, + "task_loss": 0.5851423740386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06093050805452906, + "compression/movement_sparsity/importance_threshold": -0.055432494588678716, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1711319088935852, + "epoch": 7.22, + "learning_rate": 1.8033519355518822e-06, + "loss": 0.1678, + "step": 19980, + "task_loss": 0.5596441030502319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060946361141252604, + "compression/movement_sparsity/importance_threshold": -0.05521655211826926, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15698492527008057, + "epoch": 7.22, + "learning_rate": 1.8022236260149303e-06, + "loss": 0.1647, + "step": 19990, + "task_loss": 0.33208391070365906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.060962173002965664, + "compression/movement_sparsity/importance_threshold": -0.055001171193428844, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15464134514331818, + "epoch": 7.23, + "learning_rate": 1.8010924438840057e-06, + "loss": 0.1562, + "step": 20000, + "task_loss": 0.3163996636867523 + }, + { + "epoch": 7.23, + "eval_exact_match": 83.44370860927152, + "eval_f1": 89.87271615204905, + "step": 20000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06097794369333966, + "compression/movement_sparsity/importance_threshold": -0.05478635108307339, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16933295130729675, + "epoch": 7.23, + "learning_rate": 1.7999583932096346e-06, + "loss": 0.1647, + "step": 20010, + "task_loss": 0.45220547914505005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06099367326604601, + "compression/movement_sparsity/importance_threshold": -0.054572091056119154, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14466992020606995, + "epoch": 7.24, + "learning_rate": 1.7988214780526128e-06, + "loss": 0.1618, + "step": 20020, + "task_loss": 0.282554566860199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06100936177475611, + "compression/movement_sparsity/importance_threshold": -0.05435839038148216, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13395962119102478, + "epoch": 7.24, + "learning_rate": 1.7976817024839943e-06, + "loss": 0.1693, + "step": 20030, + "task_loss": 0.6613667011260986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061025009273141376, + "compression/movement_sparsity/importance_threshold": -0.05414524832807843, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.22142064571380615, + "epoch": 7.24, + "learning_rate": 1.796539070585076e-06, + "loss": 0.1735, + "step": 20040, + "task_loss": 0.6772729754447937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06104061581487323, + "compression/movement_sparsity/importance_threshold": -0.053932664164823896, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.145456463098526, + "epoch": 7.25, + "learning_rate": 1.7953935864473823e-06, + "loss": 0.1496, + "step": 20050, + "task_loss": 0.14888328313827515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061056181453623064, + "compression/movement_sparsity/importance_threshold": -0.05372063716063491, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14171883463859558, + "epoch": 7.25, + "learning_rate": 1.7942452541726505e-06, + "loss": 0.1598, + "step": 20060, + "task_loss": 0.5084316730499268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06107170624306229, + "compression/movement_sparsity/importance_threshold": -0.05350916658442739, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15908685326576233, + "epoch": 7.25, + "learning_rate": 1.7930940778728165e-06, + "loss": 0.1678, + "step": 20070, + "task_loss": 0.8238155841827393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06108719023686233, + "compression/movement_sparsity/importance_threshold": -0.05329825170511748, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15507642924785614, + "epoch": 7.26, + "learning_rate": 1.791940061670001e-06, + "loss": 0.158, + "step": 20080, + "task_loss": 0.4280283451080322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0611026334886946, + "compression/movement_sparsity/importance_threshold": -0.05308789179162099, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14263886213302612, + "epoch": 7.26, + "learning_rate": 1.790783209696493e-06, + "loss": 0.1545, + "step": 20090, + "task_loss": 0.33214807510375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061118036052230484, + "compression/movement_sparsity/importance_threshold": -0.05287808611285438, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16570046544075012, + "epoch": 7.26, + "learning_rate": 1.789623526094736e-06, + "loss": 0.1755, + "step": 20100, + "task_loss": 0.5707616209983826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061133397981141424, + "compression/movement_sparsity/importance_threshold": -0.05266883393773336, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1842001974582672, + "epoch": 7.27, + "learning_rate": 1.7884610150173121e-06, + "loss": 0.1682, + "step": 20110, + "task_loss": 0.5207316279411316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06114871932909881, + "compression/movement_sparsity/importance_threshold": -0.052460134535174285, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16815443336963654, + "epoch": 7.27, + "learning_rate": 1.78729568062693e-06, + "loss": 0.1692, + "step": 20120, + "task_loss": 0.801066517829895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06116400014977406, + "compression/movement_sparsity/importance_threshold": -0.05225198717409296, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1244630366563797, + "epoch": 7.28, + "learning_rate": 1.7861275270964063e-06, + "loss": 0.1751, + "step": 20130, + "task_loss": 0.18696001172065735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06117924049683858, + "compression/movement_sparsity/importance_threshold": -0.05204439112340564, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16450370848178864, + "epoch": 7.28, + "learning_rate": 1.7849565586086527e-06, + "loss": 0.1742, + "step": 20140, + "task_loss": 0.5837537050247192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06119444042396378, + "compression/movement_sparsity/importance_threshold": -0.051837345652028355, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1398741602897644, + "epoch": 7.28, + "learning_rate": 1.7837827793566615e-06, + "loss": 0.1666, + "step": 20150, + "task_loss": 0.35871630907058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061209599984821075, + "compression/movement_sparsity/importance_threshold": -0.05163085002887713, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19207346439361572, + "epoch": 7.29, + "learning_rate": 1.7826061935434892e-06, + "loss": 0.1716, + "step": 20160, + "task_loss": 0.5226284861564636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061224719233081884, + "compression/movement_sparsity/importance_threshold": -0.05142490352286799, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1372203230857849, + "epoch": 7.29, + "learning_rate": 1.781426805382241e-06, + "loss": 0.1636, + "step": 20170, + "task_loss": 0.514137864112854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0612397982224176, + "compression/movement_sparsity/importance_threshold": -0.05121950540291709, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1332128494977951, + "epoch": 7.29, + "learning_rate": 1.780244619096059e-06, + "loss": 0.1751, + "step": 20180, + "task_loss": 0.2380894273519516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06125483700649964, + "compression/movement_sparsity/importance_threshold": -0.05101465493794055, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1710924506187439, + "epoch": 7.3, + "learning_rate": 1.7790596389181026e-06, + "loss": 0.1642, + "step": 20190, + "task_loss": 0.2540561854839325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06126983563899942, + "compression/movement_sparsity/importance_threshold": -0.050810351396854414, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15287163853645325, + "epoch": 7.3, + "learning_rate": 1.7778718690915366e-06, + "loss": 0.1765, + "step": 20200, + "task_loss": 0.2441101372241974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06128479417358835, + "compression/movement_sparsity/importance_threshold": -0.05060659404857448, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17295891046524048, + "epoch": 7.3, + "learning_rate": 1.776681313869515e-06, + "loss": 0.1763, + "step": 20210, + "task_loss": 0.40276211500167847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061299712663937835, + "compression/movement_sparsity/importance_threshold": -0.05040338216201712, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1395740807056427, + "epoch": 7.31, + "learning_rate": 1.7754879775151655e-06, + "loss": 0.1655, + "step": 20220, + "task_loss": 0.5500630140304565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06131459116371929, + "compression/movement_sparsity/importance_threshold": -0.05020071500609835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16768819093704224, + "epoch": 7.31, + "learning_rate": 1.774291864301574e-06, + "loss": 0.1599, + "step": 20230, + "task_loss": 0.3271426856517792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061329429726604115, + "compression/movement_sparsity/importance_threshold": -0.04999859184973421, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12167343497276306, + "epoch": 7.31, + "learning_rate": 1.7730929785117707e-06, + "loss": 0.1561, + "step": 20240, + "task_loss": 0.19313254952430725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061344228406263746, + "compression/movement_sparsity/importance_threshold": -0.04979701196184061, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15813778340816498, + "epoch": 7.32, + "learning_rate": 1.7718913244387133e-06, + "loss": 0.1774, + "step": 20250, + "task_loss": 0.46160903573036194 + }, + { + "epoch": 7.32, + "eval_exact_match": 83.61400189214758, + "eval_f1": 89.96683277095251, + "step": 20250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06135898725636957, + "compression/movement_sparsity/importance_threshold": -0.049595974611333804, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1122448593378067, + "epoch": 7.32, + "learning_rate": 1.7706869063852716e-06, + "loss": 0.1572, + "step": 20260, + "task_loss": 0.2982158660888672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061373706330593, + "compression/movement_sparsity/importance_threshold": -0.04939547906712993, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15496289730072021, + "epoch": 7.33, + "learning_rate": 1.7694797286642137e-06, + "loss": 0.1866, + "step": 20270, + "task_loss": 0.3803407549858093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06138838568260545, + "compression/movement_sparsity/importance_threshold": -0.04919552459814491, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19763313233852386, + "epoch": 7.33, + "learning_rate": 1.7683909127719155e-06, + "loss": 0.1794, + "step": 20280, + "task_loss": 0.3198865056037903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061403025366078334, + "compression/movement_sparsity/importance_threshold": -0.048996110473294774, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14763984084129333, + "epoch": 7.33, + "learning_rate": 1.7671785035994402e-06, + "loss": 0.1589, + "step": 20290, + "task_loss": 0.3247777223587036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061417625434683064, + "compression/movement_sparsity/importance_threshold": -0.048797235961495655, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16530966758728027, + "epoch": 7.34, + "learning_rate": 1.7659633473222004e-06, + "loss": 0.1628, + "step": 20300, + "task_loss": 0.3593147397041321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061432185942091044, + "compression/movement_sparsity/importance_threshold": -0.048598900331663586, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18628555536270142, + "epoch": 7.34, + "learning_rate": 1.7647454482914155e-06, + "loss": 0.1713, + "step": 20310, + "task_loss": 0.42950117588043213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06144670694197368, + "compression/movement_sparsity/importance_threshold": -0.04840110285271482, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13749028742313385, + "epoch": 7.34, + "learning_rate": 1.7635248108681248e-06, + "loss": 0.168, + "step": 20320, + "task_loss": 0.5608773231506348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06146118848800241, + "compression/movement_sparsity/importance_threshold": -0.048203842793565155, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1740993857383728, + "epoch": 7.35, + "learning_rate": 1.762301439423175e-06, + "loss": 0.1668, + "step": 20330, + "task_loss": 0.5474882125854492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061475630633848616, + "compression/movement_sparsity/importance_threshold": -0.04800711942313074, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17211446166038513, + "epoch": 7.35, + "learning_rate": 1.7610753383372007e-06, + "loss": 0.1616, + "step": 20340, + "task_loss": 0.5157222747802734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06149003343318371, + "compression/movement_sparsity/importance_threshold": -0.04781093201032771, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17125245928764343, + "epoch": 7.35, + "learning_rate": 1.7598465120006126e-06, + "loss": 0.1605, + "step": 20350, + "task_loss": 0.32079434394836426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06150439693967911, + "compression/movement_sparsity/importance_threshold": -0.04761527982407221, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15011651813983917, + "epoch": 7.36, + "learning_rate": 1.7586149648135792e-06, + "loss": 0.1772, + "step": 20360, + "task_loss": 0.3235490918159485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06151872120700623, + "compression/movement_sparsity/importance_threshold": -0.04742016213328004, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1297205686569214, + "epoch": 7.36, + "learning_rate": 1.7573807011860113e-06, + "loss": 0.1701, + "step": 20370, + "task_loss": 0.36008718609809875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061533006288836484, + "compression/movement_sparsity/importance_threshold": -0.04722557820686746, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15162158012390137, + "epoch": 7.37, + "learning_rate": 1.7561437255375478e-06, + "loss": 0.1748, + "step": 20380, + "task_loss": 0.6992220282554626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06154725223884127, + "compression/movement_sparsity/importance_threshold": -0.04703152731375049, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12533384561538696, + "epoch": 7.37, + "learning_rate": 1.7549040422975377e-06, + "loss": 0.1749, + "step": 20390, + "task_loss": 0.4742255210876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061561459110691996, + "compression/movement_sparsity/importance_threshold": -0.04683800872284527, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15761375427246094, + "epoch": 7.37, + "learning_rate": 1.7536616559050254e-06, + "loss": 0.1763, + "step": 20400, + "task_loss": 0.5680642127990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0615756269580601, + "compression/movement_sparsity/importance_threshold": -0.046645021703067724, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11964351683855057, + "epoch": 7.38, + "learning_rate": 1.7524165708087364e-06, + "loss": 0.1665, + "step": 20410, + "task_loss": 0.1860896497964859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061589755834616955, + "compression/movement_sparsity/importance_threshold": -0.04645256552333399, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12572947144508362, + "epoch": 7.38, + "learning_rate": 1.7511687914670574e-06, + "loss": 0.1527, + "step": 20420, + "task_loss": 0.4305400550365448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061603845794034, + "compression/movement_sparsity/importance_threshold": -0.046260639452560204, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12916436791419983, + "epoch": 7.38, + "learning_rate": 1.7499183223480233e-06, + "loss": 0.1843, + "step": 20430, + "task_loss": 0.461367130279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061617896889982635, + "compression/movement_sparsity/importance_threshold": -0.04606924275966229, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1794300675392151, + "epoch": 7.39, + "learning_rate": 1.7486651679293021e-06, + "loss": 0.1691, + "step": 20440, + "task_loss": 0.4559490382671356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06163190917613426, + "compression/movement_sparsity/importance_threshold": -0.0458783747135566, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16324125230312347, + "epoch": 7.39, + "learning_rate": 1.7474093326981751e-06, + "loss": 0.166, + "step": 20450, + "task_loss": 0.5274643898010254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06164588270616032, + "compression/movement_sparsity/importance_threshold": -0.04568803458315873, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1191713958978653, + "epoch": 7.39, + "learning_rate": 1.7461508211515242e-06, + "loss": 0.1474, + "step": 20460, + "task_loss": 0.48396289348602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06165981753373218, + "compression/movement_sparsity/importance_threshold": -0.04549822163738515, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12512516975402832, + "epoch": 7.4, + "learning_rate": 1.7448896377958144e-06, + "loss": 0.1547, + "step": 20470, + "task_loss": 0.33188262581825256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06167371371252128, + "compression/movement_sparsity/importance_threshold": -0.045308935145151774, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13970869779586792, + "epoch": 7.4, + "learning_rate": 1.743625787147078e-06, + "loss": 0.1544, + "step": 20480, + "task_loss": 0.5282965302467346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06168757129619902, + "compression/movement_sparsity/importance_threshold": -0.045120174375374744, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1316584199666977, + "epoch": 7.41, + "learning_rate": 1.7423592737308973e-06, + "loss": 0.1702, + "step": 20490, + "task_loss": 0.2873547673225403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06170139033843682, + "compression/movement_sparsity/importance_threshold": -0.04493193859696998, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16573722660541534, + "epoch": 7.41, + "learning_rate": 1.7410901020823918e-06, + "loss": 0.1697, + "step": 20500, + "task_loss": 0.39910686016082764 + }, + { + "epoch": 7.41, + "eval_exact_match": 83.71807000946073, + "eval_f1": 90.07741181805484, + "step": 20500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06171517089290609, + "compression/movement_sparsity/importance_threshold": -0.04474422707885373, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13655081391334534, + "epoch": 7.41, + "learning_rate": 1.7398182767461971e-06, + "loss": 0.1551, + "step": 20510, + "task_loss": 0.2691548466682434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06172891301327822, + "compression/movement_sparsity/importance_threshold": -0.04455703908994191, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16987422108650208, + "epoch": 7.42, + "learning_rate": 1.7385438022764523e-06, + "loss": 0.1721, + "step": 20520, + "task_loss": 0.4175964593887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06174261675322465, + "compression/movement_sparsity/importance_threshold": -0.04437037389915055, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13351169228553772, + "epoch": 7.42, + "learning_rate": 1.7372666832367822e-06, + "loss": 0.1573, + "step": 20530, + "task_loss": 0.46691519021987915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06175628216641678, + "compression/movement_sparsity/importance_threshold": -0.044184230775395905, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16343604028224945, + "epoch": 7.42, + "learning_rate": 1.7359869242002813e-06, + "loss": 0.1737, + "step": 20540, + "task_loss": 0.5338377356529236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06176990930652601, + "compression/movement_sparsity/importance_threshold": -0.04399860898759389, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13753941655158997, + "epoch": 7.43, + "learning_rate": 1.7347045297494976e-06, + "loss": 0.1782, + "step": 20550, + "task_loss": 0.4684370756149292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06178349822722375, + "compression/movement_sparsity/importance_threshold": -0.043813507804660756, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13331389427185059, + "epoch": 7.43, + "learning_rate": 1.7334195044764152e-06, + "loss": 0.1693, + "step": 20560, + "task_loss": 0.42766016721725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061797048982181434, + "compression/movement_sparsity/importance_threshold": -0.04362892649551231, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15660671889781952, + "epoch": 7.43, + "learning_rate": 1.73213185298244e-06, + "loss": 0.1692, + "step": 20570, + "task_loss": 0.4442064166069031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06181056162507043, + "compression/movement_sparsity/importance_threshold": -0.043444864329064914, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1403474658727646, + "epoch": 7.44, + "learning_rate": 1.7308415798783801e-06, + "loss": 0.166, + "step": 20580, + "task_loss": 0.44031822681427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0618240362095622, + "compression/movement_sparsity/importance_threshold": -0.043261320574234374, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14684978127479553, + "epoch": 7.44, + "learning_rate": 1.7295486897844326e-06, + "loss": 0.162, + "step": 20590, + "task_loss": 0.3407401144504547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06183747278932813, + "compression/movement_sparsity/importance_threshold": -0.04307829449993672, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13846829533576965, + "epoch": 7.44, + "learning_rate": 1.7282531873301647e-06, + "loss": 0.1647, + "step": 20600, + "task_loss": 0.3741005063056946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06185087141803962, + "compression/movement_sparsity/importance_threshold": -0.042895785375088424, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15826284885406494, + "epoch": 7.45, + "learning_rate": 1.7269550771544977e-06, + "loss": 0.1675, + "step": 20610, + "task_loss": 0.5317988991737366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0618642321493681, + "compression/movement_sparsity/importance_threshold": -0.04271379246860507, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20660480856895447, + "epoch": 7.45, + "learning_rate": 1.7256543639056912e-06, + "loss": 0.1728, + "step": 20620, + "task_loss": 0.6406430006027222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06187755503698497, + "compression/movement_sparsity/importance_threshold": -0.04253231504940291, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19647446274757385, + "epoch": 7.46, + "learning_rate": 1.7243510522413259e-06, + "loss": 0.1688, + "step": 20630, + "task_loss": 0.5294222831726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06189084013456164, + "compression/movement_sparsity/importance_threshold": -0.0423513523863982, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1321870982646942, + "epoch": 7.46, + "learning_rate": 1.723045146828286e-06, + "loss": 0.1639, + "step": 20640, + "task_loss": 0.3000491261482239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061904087495769526, + "compression/movement_sparsity/importance_threshold": -0.04217090374850685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17913860082626343, + "epoch": 7.46, + "learning_rate": 1.7217366523427442e-06, + "loss": 0.1631, + "step": 20650, + "task_loss": 0.6945570707321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061917297174280024, + "compression/movement_sparsity/importance_threshold": -0.04199096840464489, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17457491159439087, + "epoch": 7.47, + "learning_rate": 1.720425573470144e-06, + "loss": 0.1761, + "step": 20660, + "task_loss": 0.3351823091506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06193046922376457, + "compression/movement_sparsity/importance_threshold": -0.04181154562372835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1305508315563202, + "epoch": 7.47, + "learning_rate": 1.7191119149051824e-06, + "loss": 0.1625, + "step": 20670, + "task_loss": 0.36937570571899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06194360369789456, + "compression/movement_sparsity/importance_threshold": -0.041632634674673374, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17179100215435028, + "epoch": 7.47, + "learning_rate": 1.717795681351795e-06, + "loss": 0.1638, + "step": 20680, + "task_loss": 0.37270528078079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061956700650341395, + "compression/movement_sparsity/importance_threshold": -0.04145423482639621, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1304541826248169, + "epoch": 7.48, + "learning_rate": 1.716476877523137e-06, + "loss": 0.1696, + "step": 20690, + "task_loss": 0.2637747526168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06196976013477651, + "compression/movement_sparsity/importance_threshold": -0.04127634534781255, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1455475091934204, + "epoch": 7.48, + "learning_rate": 1.7151555081415668e-06, + "loss": 0.171, + "step": 20700, + "task_loss": 0.5703924894332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.061982782204871294, + "compression/movement_sparsity/importance_threshold": -0.04109896550783865, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15408097207546234, + "epoch": 7.48, + "learning_rate": 1.7138315779386306e-06, + "loss": 0.1617, + "step": 20710, + "task_loss": 0.49040016531944275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06199576691429716, + "compression/movement_sparsity/importance_threshold": -0.040922094575390644, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15398012101650238, + "epoch": 7.49, + "learning_rate": 1.7125050916550437e-06, + "loss": 0.157, + "step": 20720, + "task_loss": 0.18766553699970245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062008714316725534, + "compression/movement_sparsity/importance_threshold": -0.040745731819384456, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15429307520389557, + "epoch": 7.49, + "learning_rate": 1.711176054040674e-06, + "loss": 0.1717, + "step": 20730, + "task_loss": 0.40597110986709595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062021624465827804, + "compression/movement_sparsity/importance_threshold": -0.040569876508736336, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12975449860095978, + "epoch": 7.5, + "learning_rate": 1.7098444698545262e-06, + "loss": 0.1546, + "step": 20740, + "task_loss": 0.3054530620574951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062034497415275405, + "compression/movement_sparsity/importance_threshold": -0.0403945279123622, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17744165658950806, + "epoch": 7.5, + "learning_rate": 1.7085103438647223e-06, + "loss": 0.1682, + "step": 20750, + "task_loss": 0.42306089401245117 + }, + { + "epoch": 7.5, + "eval_exact_match": 83.519394512772, + "eval_f1": 89.93130104446098, + "step": 20750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06204733321873974, + "compression/movement_sparsity/importance_threshold": -0.040219685299178076, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16971740126609802, + "epoch": 7.5, + "learning_rate": 1.7071736808484873e-06, + "loss": 0.1686, + "step": 20760, + "task_loss": 0.6131412386894226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062060131929892196, + "compression/movement_sparsity/importance_threshold": -0.04004534793810022, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11441051959991455, + "epoch": 7.51, + "learning_rate": 1.70583448559213e-06, + "loss": 0.1578, + "step": 20770, + "task_loss": 0.38000303506851196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06207289360240422, + "compression/movement_sparsity/importance_threshold": -0.03987151509804454, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13867764174938202, + "epoch": 7.51, + "learning_rate": 1.7044927628910259e-06, + "loss": 0.168, + "step": 20780, + "task_loss": 0.37611913681030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0620856182899472, + "compression/movement_sparsity/importance_threshold": -0.039698186047927186, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18539825081825256, + "epoch": 7.51, + "learning_rate": 1.7031485175496028e-06, + "loss": 0.1753, + "step": 20790, + "task_loss": 0.3999943733215332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06209830604619255, + "compression/movement_sparsity/importance_threshold": -0.03952536005666407, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14076153934001923, + "epoch": 7.52, + "learning_rate": 1.7018017543813196e-06, + "loss": 0.1617, + "step": 20800, + "task_loss": 0.5043025612831116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06211095692481168, + "compression/movement_sparsity/importance_threshold": -0.03935303639317156, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17567318677902222, + "epoch": 7.52, + "learning_rate": 1.7004524782086524e-06, + "loss": 0.1791, + "step": 20810, + "task_loss": 0.38481491804122925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062123570979476, + "compression/movement_sparsity/importance_threshold": -0.039181214326365454, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14990533888339996, + "epoch": 7.52, + "learning_rate": 1.699100693863075e-06, + "loss": 0.1671, + "step": 20820, + "task_loss": 0.4502103328704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06213614826385694, + "compression/movement_sparsity/importance_threshold": -0.0390098931251619, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.19391518831253052, + "epoch": 7.53, + "learning_rate": 1.6977464061850425e-06, + "loss": 0.1825, + "step": 20830, + "task_loss": 0.501262903213501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06214868883162588, + "compression/movement_sparsity/importance_threshold": -0.03883907205847703, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17029324173927307, + "epoch": 7.53, + "learning_rate": 1.6963896200239738e-06, + "loss": 0.1572, + "step": 20840, + "task_loss": 0.34914785623550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06216119273645425, + "compression/movement_sparsity/importance_threshold": -0.03866875039522688, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14604946970939636, + "epoch": 7.54, + "learning_rate": 1.6950303402382348e-06, + "loss": 0.1772, + "step": 20850, + "task_loss": 0.4782135486602783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06217366003201345, + "compression/movement_sparsity/importance_threshold": -0.038498927404327477, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15026389062404633, + "epoch": 7.54, + "learning_rate": 1.6936685716951208e-06, + "loss": 0.1653, + "step": 20860, + "task_loss": 0.3965921401977539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0621860907719749, + "compression/movement_sparsity/importance_threshold": -0.03832960235469485, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16776353120803833, + "epoch": 7.54, + "learning_rate": 1.692304319270838e-06, + "loss": 0.1725, + "step": 20870, + "task_loss": 0.6324939131736755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06219848501001001, + "compression/movement_sparsity/importance_threshold": -0.03816077451524502, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14998579025268555, + "epoch": 7.55, + "learning_rate": 1.690937587850487e-06, + "loss": 0.1652, + "step": 20880, + "task_loss": 0.5109502077102661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06221084279979019, + "compression/movement_sparsity/importance_threshold": -0.03799244315489425, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15119841694831848, + "epoch": 7.55, + "learning_rate": 1.6895683823280459e-06, + "loss": 0.1545, + "step": 20890, + "task_loss": 0.3803994059562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06222316419498684, + "compression/movement_sparsity/importance_threshold": -0.037824607542558564, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14079336822032928, + "epoch": 7.55, + "learning_rate": 1.6881967076063509e-06, + "loss": 0.152, + "step": 20900, + "task_loss": 0.3902343809604645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06223544924927138, + "compression/movement_sparsity/importance_threshold": -0.03765726694715399, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18181651830673218, + "epoch": 7.56, + "learning_rate": 1.6868225685970807e-06, + "loss": 0.1599, + "step": 20910, + "task_loss": 0.41076356172561646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062247698016315224, + "compression/movement_sparsity/importance_threshold": -0.037490420637596444, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1383550614118576, + "epoch": 7.56, + "learning_rate": 1.6854459702207384e-06, + "loss": 0.1609, + "step": 20920, + "task_loss": 0.253571093082428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06225991054978977, + "compression/movement_sparsity/importance_threshold": -0.03732406788280218, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15035834908485413, + "epoch": 7.56, + "learning_rate": 1.6840669174066326e-06, + "loss": 0.1672, + "step": 20930, + "task_loss": 0.48510587215423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06227208690336644, + "compression/movement_sparsity/importance_threshold": -0.03715820795168734, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.145534485578537, + "epoch": 7.57, + "learning_rate": 1.6826854150928612e-06, + "loss": 0.1726, + "step": 20940, + "task_loss": 0.35285377502441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06228422713071664, + "compression/movement_sparsity/importance_threshold": -0.036992840113167724, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1519620716571808, + "epoch": 7.57, + "learning_rate": 1.6813014682262937e-06, + "loss": 0.178, + "step": 20950, + "task_loss": 0.32412058115005493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062296331285511784, + "compression/movement_sparsity/importance_threshold": -0.03682796363615959, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1372935175895691, + "epoch": 7.57, + "learning_rate": 1.6799150817625515e-06, + "loss": 0.1757, + "step": 20960, + "task_loss": 0.3011651337146759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06230839942142328, + "compression/movement_sparsity/importance_threshold": -0.03666357778957896, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1638549268245697, + "epoch": 7.58, + "learning_rate": 1.6785262606659937e-06, + "loss": 0.1654, + "step": 20970, + "task_loss": 0.6097940802574158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062320431592122534, + "compression/movement_sparsity/importance_threshold": -0.03649968184234187, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15273943543434143, + "epoch": 7.58, + "learning_rate": 1.6771350099096963e-06, + "loss": 0.1603, + "step": 20980, + "task_loss": 0.48636484146118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062332427851280965, + "compression/movement_sparsity/importance_threshold": -0.03633627506336434, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15129825472831726, + "epoch": 7.59, + "learning_rate": 1.6757413344754353e-06, + "loss": 0.1675, + "step": 20990, + "task_loss": 0.3011537194252014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06234438825256998, + "compression/movement_sparsity/importance_threshold": -0.03617335672156263, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18178239464759827, + "epoch": 7.59, + "learning_rate": 1.674345239353669e-06, + "loss": 0.1767, + "step": 21000, + "task_loss": 0.3912660479545593 + }, + { + "epoch": 7.59, + "eval_exact_match": 83.68968779564806, + "eval_f1": 90.02347462983266, + "step": 21000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062356312849660984, + "compression/movement_sparsity/importance_threshold": -0.03601092608585277, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1544092297554016, + "epoch": 7.59, + "learning_rate": 1.6729467295435202e-06, + "loss": 0.1677, + "step": 21010, + "task_loss": 0.4685562252998352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0623682016962254, + "compression/movement_sparsity/importance_threshold": -0.035848982425150555, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14076654613018036, + "epoch": 7.6, + "learning_rate": 1.6715458100527587e-06, + "loss": 0.1547, + "step": 21020, + "task_loss": 0.484361469745636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062380054845934625, + "compression/movement_sparsity/importance_threshold": -0.035687525008372356, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14936229586601257, + "epoch": 7.6, + "learning_rate": 1.6701424858977814e-06, + "loss": 0.1765, + "step": 21030, + "task_loss": 0.4672033190727234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06239187235246008, + "compression/movement_sparsity/importance_threshold": -0.03552655310443398, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1623292863368988, + "epoch": 7.6, + "learning_rate": 1.668736762103598e-06, + "loss": 0.1618, + "step": 21040, + "task_loss": 0.41577792167663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06240365426947318, + "compression/movement_sparsity/importance_threshold": -0.03536606598225167, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14613062143325806, + "epoch": 7.61, + "learning_rate": 1.6673286437038083e-06, + "loss": 0.1754, + "step": 21050, + "task_loss": 0.22772684693336487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06241540065064532, + "compression/movement_sparsity/importance_threshold": -0.035206062910741576, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17443513870239258, + "epoch": 7.61, + "learning_rate": 1.665918135740589e-06, + "loss": 0.1671, + "step": 21060, + "task_loss": 0.37917977571487427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06242711154964792, + "compression/movement_sparsity/importance_threshold": -0.03504654315881961, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1646181046962738, + "epoch": 7.61, + "learning_rate": 1.6645052432646715e-06, + "loss": 0.1775, + "step": 21070, + "task_loss": 0.3403213620185852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06243878702015238, + "compression/movement_sparsity/importance_threshold": -0.0348875059954018, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15902748703956604, + "epoch": 7.62, + "learning_rate": 1.663089971335327e-06, + "loss": 0.157, + "step": 21080, + "task_loss": 0.3937772512435913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06245042711583012, + "compression/movement_sparsity/importance_threshold": -0.034728950689404514, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16536450386047363, + "epoch": 7.62, + "learning_rate": 1.661672325020346e-06, + "loss": 0.1618, + "step": 21090, + "task_loss": 0.3008047938346863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06246203189035256, + "compression/movement_sparsity/importance_threshold": -0.03457087650974333, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16399919986724854, + "epoch": 7.63, + "learning_rate": 1.660252309396022e-06, + "loss": 0.1695, + "step": 21100, + "task_loss": 0.45590633153915405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.062473601397391096, + "compression/movement_sparsity/importance_threshold": -0.034413282725334726, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14936356246471405, + "epoch": 7.63, + "learning_rate": 1.6588299295471316e-06, + "loss": 0.1614, + "step": 21110, + "task_loss": 0.30914896726608276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06248513569061714, + "compression/movement_sparsity/importance_threshold": -0.03425616860509462, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13448117673397064, + "epoch": 7.63, + "learning_rate": 1.6574051905669179e-06, + "loss": 0.1685, + "step": 21120, + "task_loss": 0.5696876049041748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0624966348237021, + "compression/movement_sparsity/importance_threshold": -0.03409953341793903, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15890441834926605, + "epoch": 7.64, + "learning_rate": 1.6559780975570715e-06, + "loss": 0.177, + "step": 21130, + "task_loss": 0.29223620891571045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0625080988503174, + "compression/movement_sparsity/importance_threshold": -0.03394337643278411, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15977126359939575, + "epoch": 7.64, + "learning_rate": 1.6545486556277118e-06, + "loss": 0.1579, + "step": 21140, + "task_loss": 0.42903733253479004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06251952782413445, + "compression/movement_sparsity/importance_threshold": -0.033787696918545884, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12481575459241867, + "epoch": 7.64, + "learning_rate": 1.6531168698973698e-06, + "loss": 0.1673, + "step": 21150, + "task_loss": 0.4769006669521332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06253092179882463, + "compression/movement_sparsity/importance_threshold": -0.03363249414414049, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14009490609169006, + "epoch": 7.65, + "learning_rate": 1.6516827454929691e-06, + "loss": 0.167, + "step": 21160, + "task_loss": 0.47750040888786316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06254228082805939, + "compression/movement_sparsity/importance_threshold": -0.033477767378483736, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13786697387695312, + "epoch": 7.65, + "learning_rate": 1.6502462875498072e-06, + "loss": 0.1607, + "step": 21170, + "task_loss": 0.31475692987442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06255360496551013, + "compression/movement_sparsity/importance_threshold": -0.033323515890492095, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13212624192237854, + "epoch": 7.65, + "learning_rate": 1.6488075012115372e-06, + "loss": 0.1696, + "step": 21180, + "task_loss": 0.2864471673965454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06256489426484825, + "compression/movement_sparsity/importance_threshold": -0.03316973894908137, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1384587585926056, + "epoch": 7.66, + "learning_rate": 1.6473663916301506e-06, + "loss": 0.1717, + "step": 21190, + "task_loss": 0.23976373672485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06257614877974516, + "compression/movement_sparsity/importance_threshold": -0.0330164358231676, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1388477087020874, + "epoch": 7.66, + "learning_rate": 1.6459229639659574e-06, + "loss": 0.1524, + "step": 21200, + "task_loss": 0.30393633246421814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06258736856387229, + "compression/movement_sparsity/importance_threshold": -0.03286360578166703, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1598781794309616, + "epoch": 7.67, + "learning_rate": 1.6444772233875686e-06, + "loss": 0.1706, + "step": 21210, + "task_loss": 0.4479847252368927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06259855367090102, + "compression/movement_sparsity/importance_threshold": -0.03271124809349546, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15409858524799347, + "epoch": 7.67, + "learning_rate": 1.6430291750718763e-06, + "loss": 0.1681, + "step": 21220, + "task_loss": 0.6857647895812988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06260970415450279, + "compression/movement_sparsity/importance_threshold": -0.03255936202756937, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.156455859541893, + "epoch": 7.67, + "learning_rate": 1.6415788242040375e-06, + "loss": 0.1702, + "step": 21230, + "task_loss": 0.4796447157859802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06262082006834899, + "compression/movement_sparsity/importance_threshold": -0.03240794685280446, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17359215021133423, + "epoch": 7.68, + "learning_rate": 1.6401261759774529e-06, + "loss": 0.17, + "step": 21240, + "task_loss": 0.6077663898468018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06263190146611104, + "compression/movement_sparsity/importance_threshold": -0.03225700183811686, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15661515295505524, + "epoch": 7.68, + "learning_rate": 1.6386712355937506e-06, + "loss": 0.1798, + "step": 21250, + "task_loss": 0.4524965286254883 + }, + { + "epoch": 7.68, + "eval_exact_match": 83.59508041627247, + "eval_f1": 89.99440399360535, + "step": 21250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06264294840146034, + "compression/movement_sparsity/importance_threshold": -0.03210652625242283, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16243040561676025, + "epoch": 7.68, + "learning_rate": 1.6372140082627653e-06, + "loss": 0.1725, + "step": 21260, + "task_loss": 0.7392611503601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06265396092806833, + "compression/movement_sparsity/importance_threshold": -0.03195651936463828, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1820927858352661, + "epoch": 7.69, + "learning_rate": 1.6357544992025214e-06, + "loss": 0.1629, + "step": 21270, + "task_loss": 0.7152500152587891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06266493909960638, + "compression/movement_sparsity/importance_threshold": -0.031806980443679245, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1378469318151474, + "epoch": 7.69, + "learning_rate": 1.6342927136392146e-06, + "loss": 0.1635, + "step": 21280, + "task_loss": 0.4715365171432495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06267588296974594, + "compression/movement_sparsity/importance_threshold": -0.03165790875846186, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14740951359272003, + "epoch": 7.69, + "learning_rate": 1.6328286568071903e-06, + "loss": 0.1732, + "step": 21290, + "task_loss": 0.5006458163261414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0626867925921584, + "compression/movement_sparsity/importance_threshold": -0.03150930357790216, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13728134334087372, + "epoch": 7.7, + "learning_rate": 1.6313623339489285e-06, + "loss": 0.16, + "step": 21300, + "task_loss": 0.3875589966773987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06269766802051516, + "compression/movement_sparsity/importance_threshold": -0.03136116417091628, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.24846616387367249, + "epoch": 7.7, + "learning_rate": 1.6298937503150226e-06, + "loss": 0.1714, + "step": 21310, + "task_loss": 0.4736020565032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06270850930848765, + "compression/movement_sparsity/importance_threshold": -0.031213489806420247, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2179969847202301, + "epoch": 7.71, + "learning_rate": 1.6284229111641613e-06, + "loss": 0.1854, + "step": 21320, + "task_loss": 0.5530951619148254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06271931650974727, + "compression/movement_sparsity/importance_threshold": -0.031066279753330095, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13526931405067444, + "epoch": 7.71, + "learning_rate": 1.6269498217631102e-06, + "loss": 0.1558, + "step": 21330, + "task_loss": 0.42102596163749695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06273008967796542, + "compression/movement_sparsity/importance_threshold": -0.03091953328056196, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14113759994506836, + "epoch": 7.71, + "learning_rate": 1.6254744873866926e-06, + "loss": 0.1727, + "step": 21340, + "task_loss": 0.277980774641037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06274082886681355, + "compression/movement_sparsity/importance_threshold": -0.030773249657031876, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13323111832141876, + "epoch": 7.72, + "learning_rate": 1.6239969133177703e-06, + "loss": 0.182, + "step": 21350, + "task_loss": 0.44536280632019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06275153412996302, + "compression/movement_sparsity/importance_threshold": -0.030627428151655756, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16452021896839142, + "epoch": 7.72, + "learning_rate": 1.622517104847225e-06, + "loss": 0.1607, + "step": 21360, + "task_loss": 0.38459068536758423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06276220552108529, + "compression/movement_sparsity/importance_threshold": -0.030482068033349963, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16616493463516235, + "epoch": 7.72, + "learning_rate": 1.6210350672739396e-06, + "loss": 0.1731, + "step": 21370, + "task_loss": 0.8876696825027466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06277284309385174, + "compression/movement_sparsity/importance_threshold": -0.030337168571030304, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1460474133491516, + "epoch": 7.73, + "learning_rate": 1.6195508059047782e-06, + "loss": 0.1685, + "step": 21380, + "task_loss": 0.30946826934814453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06278344690193377, + "compression/movement_sparsity/importance_threshold": -0.03019272903361314, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17295223474502563, + "epoch": 7.73, + "learning_rate": 1.6180643260545695e-06, + "loss": 0.1735, + "step": 21390, + "task_loss": 0.4286682903766632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06279401699900283, + "compression/movement_sparsity/importance_threshold": -0.03004874869001417, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1404990553855896, + "epoch": 7.73, + "learning_rate": 1.6165756330460838e-06, + "loss": 0.1617, + "step": 21400, + "task_loss": 0.40137583017349243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0628045534387303, + "compression/movement_sparsity/importance_threshold": -0.029905226809149643, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14242419600486755, + "epoch": 7.74, + "learning_rate": 1.6150847322100181e-06, + "loss": 0.1605, + "step": 21410, + "task_loss": 0.43105363845825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06281505627478759, + "compression/movement_sparsity/importance_threshold": -0.029762162659935698, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15184879302978516, + "epoch": 7.74, + "learning_rate": 1.6135916288849743e-06, + "loss": 0.1734, + "step": 21420, + "task_loss": 0.26615574955940247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06282552556084613, + "compression/movement_sparsity/importance_threshold": -0.029619555511288254, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20159095525741577, + "epoch": 7.74, + "learning_rate": 1.6120963284174414e-06, + "loss": 0.1686, + "step": 21430, + "task_loss": 0.810195803642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06283596135057731, + "compression/movement_sparsity/importance_threshold": -0.02947740463212356, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13885483145713806, + "epoch": 7.75, + "learning_rate": 1.6105988361617753e-06, + "loss": 0.17, + "step": 21440, + "task_loss": 0.8474996089935303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06284636369765256, + "compression/movement_sparsity/importance_threshold": -0.029335709291357537, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15829253196716309, + "epoch": 7.75, + "learning_rate": 1.609099157480181e-06, + "loss": 0.1593, + "step": 21450, + "task_loss": 0.3659548759460449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06285673265574328, + "compression/movement_sparsity/importance_threshold": -0.02919446875790621, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16529636085033417, + "epoch": 7.76, + "learning_rate": 1.6075972977426924e-06, + "loss": 0.1695, + "step": 21460, + "task_loss": 0.2963300943374634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06286706827852089, + "compression/movement_sparsity/importance_threshold": -0.029053682300685835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.2159816175699234, + "epoch": 7.76, + "learning_rate": 1.6060932623271524e-06, + "loss": 0.1615, + "step": 21470, + "task_loss": 0.40529030561447144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06287737061965677, + "compression/movement_sparsity/importance_threshold": -0.028913349188612325, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13636060059070587, + "epoch": 7.76, + "learning_rate": 1.6045870566191958e-06, + "loss": 0.153, + "step": 21480, + "task_loss": 0.42834505438804626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06288763973282237, + "compression/movement_sparsity/importance_threshold": -0.028773468690601822, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11330045759677887, + "epoch": 7.77, + "learning_rate": 1.6030786860122283e-06, + "loss": 0.1819, + "step": 21490, + "task_loss": 0.2636592984199524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06289787567168907, + "compression/movement_sparsity/importance_threshold": -0.028634040075570244, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16234830021858215, + "epoch": 7.77, + "learning_rate": 1.6015681559074076e-06, + "loss": 0.1722, + "step": 21500, + "task_loss": 0.5878846049308777 + }, + { + "epoch": 7.77, + "eval_exact_match": 83.43424787133397, + "eval_f1": 89.87035128509326, + "step": 21500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0629080784899283, + "compression/movement_sparsity/importance_threshold": -0.028495062612433952, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1218387633562088, + "epoch": 7.77, + "learning_rate": 1.6000554717136239e-06, + "loss": 0.153, + "step": 21510, + "task_loss": 0.19702383875846863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06291824824121148, + "compression/movement_sparsity/importance_threshold": -0.028356535570108754, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17990869283676147, + "epoch": 7.78, + "learning_rate": 1.5985406388474809e-06, + "loss": 0.1776, + "step": 21520, + "task_loss": 0.416679322719574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06292838497920998, + "compression/movement_sparsity/importance_threshold": -0.02821845821751079, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18338486552238464, + "epoch": 7.78, + "learning_rate": 1.5970236627332766e-06, + "loss": 0.1697, + "step": 21530, + "task_loss": 0.8984547853469849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06293848875759525, + "compression/movement_sparsity/importance_threshold": -0.028080829823556086, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.146068274974823, + "epoch": 7.78, + "learning_rate": 1.595504548802983e-06, + "loss": 0.1673, + "step": 21540, + "task_loss": 0.5259707570075989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06294855963003868, + "compression/movement_sparsity/importance_threshold": -0.027943649657160896, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14311178028583527, + "epoch": 7.79, + "learning_rate": 1.5939833024962272e-06, + "loss": 0.1753, + "step": 21550, + "task_loss": 0.3172750473022461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0629585976502117, + "compression/movement_sparsity/importance_threshold": -0.027806916987241137, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12539856135845184, + "epoch": 7.79, + "learning_rate": 1.5924599292602725e-06, + "loss": 0.1499, + "step": 21560, + "task_loss": 0.5374971628189087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06296860287178568, + "compression/movement_sparsity/importance_threshold": -0.027670631082712727, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1600005328655243, + "epoch": 7.8, + "learning_rate": 1.5909344345499976e-06, + "loss": 0.1781, + "step": 21570, + "task_loss": 0.37696415185928345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06297857534843208, + "compression/movement_sparsity/importance_threshold": -0.027534791212492027, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1379353404045105, + "epoch": 7.8, + "learning_rate": 1.5894068238278782e-06, + "loss": 0.1637, + "step": 21580, + "task_loss": 0.4239872097969055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0629885151338223, + "compression/movement_sparsity/importance_threshold": -0.027399396645494956, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18081137537956238, + "epoch": 7.8, + "learning_rate": 1.5878771025639664e-06, + "loss": 0.1791, + "step": 21590, + "task_loss": 0.387275755405426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06299842228162772, + "compression/movement_sparsity/importance_threshold": -0.02726444665063754, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14757958054542542, + "epoch": 7.81, + "learning_rate": 1.5863452762358725e-06, + "loss": 0.1521, + "step": 21600, + "task_loss": 0.4220387637615204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06300829684551976, + "compression/movement_sparsity/importance_threshold": -0.027129940496836036, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13776156306266785, + "epoch": 7.81, + "learning_rate": 1.584811350328744e-06, + "loss": 0.1666, + "step": 21610, + "task_loss": 0.38100963830947876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06301813887916986, + "compression/movement_sparsity/importance_threshold": -0.026995877453006356, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15775710344314575, + "epoch": 7.81, + "learning_rate": 1.5832753303352466e-06, + "loss": 0.1679, + "step": 21620, + "task_loss": 0.34619224071502686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06302794843624941, + "compression/movement_sparsity/importance_threshold": -0.02686225678806442, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15650063753128052, + "epoch": 7.82, + "learning_rate": 1.5817372217555452e-06, + "loss": 0.1709, + "step": 21630, + "task_loss": 0.2966180741786957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06303772557042982, + "compression/movement_sparsity/importance_threshold": -0.02672907777092659, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.20328043401241302, + "epoch": 7.82, + "learning_rate": 1.5801970300972825e-06, + "loss": 0.1767, + "step": 21640, + "task_loss": 0.6341493725776672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0630474703353825, + "compression/movement_sparsity/importance_threshold": -0.026596339670508784, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16929413378238678, + "epoch": 7.82, + "learning_rate": 1.5786547608755604e-06, + "loss": 0.1635, + "step": 21650, + "task_loss": 0.4473767876625061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06305718278477886, + "compression/movement_sparsity/importance_threshold": -0.02646404175572714, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14825168251991272, + "epoch": 7.83, + "learning_rate": 1.577110419612921e-06, + "loss": 0.1699, + "step": 21660, + "task_loss": 0.4353300929069519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0630668629722903, + "compression/movement_sparsity/importance_threshold": -0.02633218329549769, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12816864252090454, + "epoch": 7.83, + "learning_rate": 1.575564011839325e-06, + "loss": 0.1517, + "step": 21670, + "task_loss": 0.6550402641296387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06307651095158827, + "compression/movement_sparsity/importance_threshold": -0.026200763558736462, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17422786355018616, + "epoch": 7.84, + "learning_rate": 1.574015543092133e-06, + "loss": 0.1812, + "step": 21680, + "task_loss": 0.496726393699646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06308612677634413, + "compression/movement_sparsity/importance_threshold": -0.026069781814359483, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18180793523788452, + "epoch": 7.84, + "learning_rate": 1.5724650189160866e-06, + "loss": 0.1807, + "step": 21690, + "task_loss": 0.5905567407608032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06309571050022933, + "compression/movement_sparsity/importance_threshold": -0.025939237331283005, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14586716890335083, + "epoch": 7.84, + "learning_rate": 1.5709124448632855e-06, + "loss": 0.1862, + "step": 21700, + "task_loss": 0.908523440361023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06310526217691527, + "compression/movement_sparsity/importance_threshold": -0.025809129378422835, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16504916548728943, + "epoch": 7.85, + "learning_rate": 1.5693578264931715e-06, + "loss": 0.1572, + "step": 21710, + "task_loss": 0.3277358114719391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06311478186007335, + "compression/movement_sparsity/importance_threshold": -0.025679457224695224, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16319605708122253, + "epoch": 7.85, + "learning_rate": 1.5678011693725051e-06, + "loss": 0.1624, + "step": 21720, + "task_loss": 0.3537963628768921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06312426960337499, + "compression/movement_sparsity/importance_threshold": -0.02555022013901631, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16419553756713867, + "epoch": 7.85, + "learning_rate": 1.5662424790753482e-06, + "loss": 0.1614, + "step": 21730, + "task_loss": 0.4133787751197815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06313372546049159, + "compression/movement_sparsity/importance_threshold": -0.025421417390302015, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1400185227394104, + "epoch": 7.86, + "learning_rate": 1.5646817611830424e-06, + "loss": 0.1625, + "step": 21740, + "task_loss": 0.32223424315452576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06314314948509457, + "compression/movement_sparsity/importance_threshold": -0.025293048247468364, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15030598640441895, + "epoch": 7.86, + "learning_rate": 1.5631190212841903e-06, + "loss": 0.1713, + "step": 21750, + "task_loss": 0.4918508529663086 + }, + { + "epoch": 7.86, + "eval_exact_match": 83.43424787133397, + "eval_f1": 89.87738840205206, + "step": 21750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06315254173085535, + "compression/movement_sparsity/importance_threshold": -0.025165111979431498, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1533832848072052, + "epoch": 7.86, + "learning_rate": 1.5615542649746348e-06, + "loss": 0.1572, + "step": 21760, + "task_loss": 0.3352183997631073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06316190225144531, + "compression/movement_sparsity/importance_threshold": -0.025037607855107558, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.13758356869220734, + "epoch": 7.87, + "learning_rate": 1.5599874978574383e-06, + "loss": 0.1681, + "step": 21770, + "task_loss": 0.2775072455406189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06317123110053591, + "compression/movement_sparsity/importance_threshold": -0.02491053514341246, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15805354714393616, + "epoch": 7.87, + "learning_rate": 1.558418725542865e-06, + "loss": 0.1715, + "step": 21780, + "task_loss": 0.3538605868816376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0631805283317985, + "compression/movement_sparsity/importance_threshold": -0.024783893113262234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14023539423942566, + "epoch": 7.87, + "learning_rate": 1.5568479536483574e-06, + "loss": 0.1725, + "step": 21790, + "task_loss": 0.438821017742157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06318979399890454, + "compression/movement_sparsity/importance_threshold": -0.024657681033573242, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1380080133676529, + "epoch": 7.88, + "learning_rate": 1.5552751877985198e-06, + "loss": 0.1598, + "step": 21800, + "task_loss": 0.35417595505714417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06319902815552542, + "compression/movement_sparsity/importance_threshold": -0.02453189817326129, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1379581242799759, + "epoch": 7.88, + "learning_rate": 1.5537004336250953e-06, + "loss": 0.1623, + "step": 21810, + "task_loss": 0.2302461713552475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06320823085533255, + "compression/movement_sparsity/importance_threshold": -0.02440654380124263, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17599256336688995, + "epoch": 7.89, + "learning_rate": 1.5521236967669476e-06, + "loss": 0.1751, + "step": 21820, + "task_loss": 0.4087195098400116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06321740215199734, + "compression/movement_sparsity/importance_threshold": -0.02428161718643307, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15393874049186707, + "epoch": 7.89, + "learning_rate": 1.5505449828700391e-06, + "loss": 0.1707, + "step": 21830, + "task_loss": 0.36279088258743286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0632265420991912, + "compression/movement_sparsity/importance_threshold": -0.024157117597748967, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1479380875825882, + "epoch": 7.89, + "learning_rate": 1.5489642975874122e-06, + "loss": 0.1614, + "step": 21840, + "task_loss": 0.43778854608535767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06323565075058554, + "compression/movement_sparsity/importance_threshold": -0.024033044304106133, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.16105225682258606, + "epoch": 7.9, + "learning_rate": 1.5473816465791684e-06, + "loss": 0.1619, + "step": 21850, + "task_loss": 0.3186939060688019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06324472815985178, + "compression/movement_sparsity/importance_threshold": -0.023909396574420927, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.12178893387317657, + "epoch": 7.9, + "learning_rate": 1.5457970355124478e-06, + "loss": 0.1616, + "step": 21860, + "task_loss": 0.5092835426330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06325377438066133, + "compression/movement_sparsity/importance_threshold": -0.023786173677609046, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14096394181251526, + "epoch": 7.9, + "learning_rate": 1.5442104700614089e-06, + "loss": 0.173, + "step": 21870, + "task_loss": 0.4207335114479065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06326278946668559, + "compression/movement_sparsity/importance_threshold": -0.02366337488258685, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1833285242319107, + "epoch": 7.91, + "learning_rate": 1.542621955907209e-06, + "loss": 0.1776, + "step": 21880, + "task_loss": 0.35541167855262756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06327177347159599, + "compression/movement_sparsity/importance_threshold": -0.02354099945827015, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1276547908782959, + "epoch": 7.91, + "learning_rate": 1.5410314987379826e-06, + "loss": 0.1608, + "step": 21890, + "task_loss": 0.9517670273780823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06328072644906392, + "compression/movement_sparsity/importance_threshold": -0.023419046673575417, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17998504638671875, + "epoch": 7.91, + "learning_rate": 1.5394391042488227e-06, + "loss": 0.162, + "step": 21900, + "task_loss": 0.39371180534362793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06328964845276079, + "compression/movement_sparsity/importance_threshold": -0.023297515797418344, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15145400166511536, + "epoch": 7.92, + "learning_rate": 1.5378447781417583e-06, + "loss": 0.1493, + "step": 21910, + "task_loss": 0.35905247926712036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06329853953635801, + "compression/movement_sparsity/importance_threshold": -0.023176406098715185, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14053452014923096, + "epoch": 7.92, + "learning_rate": 1.5362485261257357e-06, + "loss": 0.1702, + "step": 21920, + "task_loss": 0.6316653490066528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06330739975352702, + "compression/movement_sparsity/importance_threshold": -0.023055716846381857, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.1474117934703827, + "epoch": 7.93, + "learning_rate": 1.5346503539165975e-06, + "loss": 0.1663, + "step": 21930, + "task_loss": 0.4833589792251587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0633162291579392, + "compression/movement_sparsity/importance_threshold": -0.0229354473093345, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.11505290865898132, + "epoch": 7.93, + "learning_rate": 1.5330502672370624e-06, + "loss": 0.1662, + "step": 21940, + "task_loss": 0.22523199021816254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06332502780326597, + "compression/movement_sparsity/importance_threshold": -0.022815596756489254, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.15433210134506226, + "epoch": 7.93, + "learning_rate": 1.5314482718167034e-06, + "loss": 0.1667, + "step": 21950, + "task_loss": 0.4213239252567291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06333379574317875, + "compression/movement_sparsity/importance_threshold": -0.022696164456762147, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18093153834342957, + "epoch": 7.94, + "learning_rate": 1.5298443733919294e-06, + "loss": 0.1637, + "step": 21960, + "task_loss": 0.34346556663513184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06334253303134893, + "compression/movement_sparsity/importance_threshold": -0.02257714967906932, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.10571896284818649, + "epoch": 7.94, + "learning_rate": 1.5282385777059635e-06, + "loss": 0.1562, + "step": 21970, + "task_loss": 0.17734256386756897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06335123972144793, + "compression/movement_sparsity/importance_threshold": -0.022458551692326578, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.18012669682502747, + "epoch": 7.94, + "learning_rate": 1.526630890508821e-06, + "loss": 0.1653, + "step": 21980, + "task_loss": 0.5104833245277405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06335991586714718, + "compression/movement_sparsity/importance_threshold": -0.022340369765450174, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.17314781248569489, + "epoch": 7.95, + "learning_rate": 1.5250213175572921e-06, + "loss": 0.1676, + "step": 21990, + "task_loss": 0.3881048560142517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06336856152211805, + "compression/movement_sparsity/importance_threshold": -0.022222603167356247, + "compression/movement_sparsity/linear_layer_sparsity": 0.39830344237724996, + "compression/movement_sparsity/model_sparsity": 0.22154907944140886, + "compression_loss": 0.0, + "distillation_loss": 0.14116662740707397, + "epoch": 7.95, + "learning_rate": 1.523409864614919e-06, + "loss": 0.1608, + "step": 22000, + "task_loss": 0.40728896856307983 + }, + { + "epoch": 7.95, + "eval_exact_match": 83.80321665089878, + "eval_f1": 90.15605593670335, + "step": 22000 + } + ], + "max_steps": 49806, + "num_train_epochs": 18, + "total_flos": 1.968581126605824e+16, + "trial_name": null, + "trial_params": null +}