diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,107822 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.999366420274551, + "global_step": 8281, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25816988945007324, + "epoch": 0.0, + "learning_rate": 4.9993962081874176e-05, + "loss": 0.2805, + "step": 1, + "task_loss": 0.5853821039199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18578635156154633, + "epoch": 0.0, + "learning_rate": 4.998792416374834e-05, + "loss": 0.1863, + "step": 2, + "task_loss": 0.16780924797058105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23223145306110382, + "epoch": 0.0, + "learning_rate": 4.998188624562251e-05, + "loss": 0.4064, + "step": 3, + "task_loss": 0.3836962878704071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25595200061798096, + "epoch": 0.0, + "learning_rate": 4.9975848327496685e-05, + "loss": 0.3625, + "step": 4, + "task_loss": 0.6386852860450745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4139005243778229, + "epoch": 0.0, + "learning_rate": 4.996981040937085e-05, + "loss": 0.389, + "step": 5, + "task_loss": 0.2773173153400421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17063988745212555, + "epoch": 0.01, + "learning_rate": 4.996377249124502e-05, + "loss": 0.3621, + "step": 6, + "task_loss": 0.5356371998786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26443102955818176, + "epoch": 0.01, + "learning_rate": 4.995773457311919e-05, + "loss": 0.3757, + "step": 7, + "task_loss": 0.3861358165740967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24171142280101776, + "epoch": 0.01, + "learning_rate": 4.995169665499336e-05, + "loss": 0.4133, + "step": 8, + "task_loss": 0.17129471898078918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2744707465171814, + "epoch": 0.01, + "learning_rate": 4.994565873686753e-05, + "loss": 0.413, + "step": 9, + "task_loss": 0.3300344944000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3388405442237854, + "epoch": 0.01, + "learning_rate": 4.99396208187417e-05, + "loss": 0.2507, + "step": 10, + "task_loss": 0.2072344720363617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.147495299577713, + "epoch": 0.01, + "learning_rate": 4.993358290061587e-05, + "loss": 0.3421, + "step": 11, + "task_loss": 0.3913913369178772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41760873794555664, + "epoch": 0.01, + "learning_rate": 4.9927544982490036e-05, + "loss": 0.3769, + "step": 12, + "task_loss": 0.3634096086025238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4394819140434265, + "epoch": 0.01, + "learning_rate": 4.992150706436421e-05, + "loss": 0.3362, + "step": 13, + "task_loss": 0.6043516397476196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38453197479248047, + "epoch": 0.01, + "learning_rate": 4.9915469146238384e-05, + "loss": 0.2789, + "step": 14, + "task_loss": 0.4792601466178894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3559964597225189, + "epoch": 0.01, + "learning_rate": 4.9909431228112544e-05, + "loss": 0.4384, + "step": 15, + "task_loss": 0.49861496686935425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2305917739868164, + "epoch": 0.01, + "learning_rate": 4.990339330998672e-05, + "loss": 0.2783, + "step": 16, + "task_loss": 0.7065786123275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4214351773262024, + "epoch": 0.01, + "learning_rate": 4.989735539186089e-05, + "loss": 0.3208, + "step": 17, + "task_loss": 1.549605369567871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49794405698776245, + "epoch": 0.02, + "learning_rate": 4.989131747373506e-05, + "loss": 0.5574, + "step": 18, + "task_loss": 0.6901516914367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22809137403964996, + "epoch": 0.02, + "learning_rate": 4.9885279555609226e-05, + "loss": 0.274, + "step": 19, + "task_loss": 1.07197904586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49502134323120117, + "epoch": 0.02, + "learning_rate": 4.98792416374834e-05, + "loss": 0.5689, + "step": 20, + "task_loss": 1.110392689704895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2524433732032776, + "epoch": 0.02, + "learning_rate": 4.987320371935757e-05, + "loss": 0.3597, + "step": 21, + "task_loss": 0.804271936416626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5214238166809082, + "epoch": 0.02, + "learning_rate": 4.9867165801231735e-05, + "loss": 0.3759, + "step": 22, + "task_loss": 0.6734863519668579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1858106255531311, + "epoch": 0.02, + "learning_rate": 4.986112788310591e-05, + "loss": 0.3365, + "step": 23, + "task_loss": 0.6390823125839233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2975519299507141, + "epoch": 0.02, + "learning_rate": 4.9855089964980076e-05, + "loss": 0.3212, + "step": 24, + "task_loss": 0.1450793743133545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3863736391067505, + "epoch": 0.02, + "learning_rate": 4.984905204685424e-05, + "loss": 0.4371, + "step": 25, + "task_loss": 0.8929804563522339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31156808137893677, + "epoch": 0.02, + "learning_rate": 4.984301412872842e-05, + "loss": 0.5147, + "step": 26, + "task_loss": 0.7455569505691528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32773831486701965, + "epoch": 0.02, + "learning_rate": 4.9836976210602584e-05, + "loss": 0.3446, + "step": 27, + "task_loss": 1.1415926218032837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17569687962532043, + "epoch": 0.02, + "learning_rate": 4.983093829247676e-05, + "loss": 0.2192, + "step": 28, + "task_loss": 0.20533543825149536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3167559802532196, + "epoch": 0.02, + "learning_rate": 4.9824900374350925e-05, + "loss": 0.3656, + "step": 29, + "task_loss": 1.0243937969207764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4247322380542755, + "epoch": 0.03, + "learning_rate": 4.98188624562251e-05, + "loss": 0.3238, + "step": 30, + "task_loss": 0.589907705783844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4397602081298828, + "epoch": 0.03, + "learning_rate": 4.9812824538099266e-05, + "loss": 0.3644, + "step": 31, + "task_loss": 0.5212509632110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23409625887870789, + "epoch": 0.03, + "learning_rate": 4.9806786619973434e-05, + "loss": 0.333, + "step": 32, + "task_loss": 0.7313840985298157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12164507061243057, + "epoch": 0.03, + "learning_rate": 4.980074870184761e-05, + "loss": 0.4048, + "step": 33, + "task_loss": 0.7823198437690735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6909140348434448, + "epoch": 0.03, + "learning_rate": 4.9794710783721775e-05, + "loss": 0.4481, + "step": 34, + "task_loss": 0.6988318562507629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3984873294830322, + "epoch": 0.03, + "learning_rate": 4.978867286559594e-05, + "loss": 0.4058, + "step": 35, + "task_loss": 0.48753196001052856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13683383166790009, + "epoch": 0.03, + "learning_rate": 4.9782634947470116e-05, + "loss": 0.3467, + "step": 36, + "task_loss": 0.29020625352859497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2913225293159485, + "epoch": 0.03, + "learning_rate": 4.977659702934428e-05, + "loss": 0.4073, + "step": 37, + "task_loss": 0.7701352834701538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44909706711769104, + "epoch": 0.03, + "learning_rate": 4.977055911121846e-05, + "loss": 0.3958, + "step": 38, + "task_loss": 0.46714961528778076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37656569480895996, + "epoch": 0.03, + "learning_rate": 4.9764521193092624e-05, + "loss": 0.3124, + "step": 39, + "task_loss": 0.5885592699050903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2543381452560425, + "epoch": 0.03, + "learning_rate": 4.975848327496679e-05, + "loss": 0.3764, + "step": 40, + "task_loss": 1.316689372062683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5124260187149048, + "epoch": 0.03, + "learning_rate": 4.9752445356840965e-05, + "loss": 0.418, + "step": 41, + "task_loss": 0.5791172385215759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25226742029190063, + "epoch": 0.04, + "learning_rate": 4.974640743871513e-05, + "loss": 0.2663, + "step": 42, + "task_loss": 0.38406485319137573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34966787695884705, + "epoch": 0.04, + "learning_rate": 4.97403695205893e-05, + "loss": 0.4449, + "step": 43, + "task_loss": 1.1614713668823242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23528718948364258, + "epoch": 0.04, + "learning_rate": 4.9734331602463474e-05, + "loss": 0.3279, + "step": 44, + "task_loss": 0.4693385064601898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6375081539154053, + "epoch": 0.04, + "learning_rate": 4.972829368433764e-05, + "loss": 0.4547, + "step": 45, + "task_loss": 0.8986639976501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5959746837615967, + "epoch": 0.04, + "learning_rate": 4.9722255766211815e-05, + "loss": 0.4487, + "step": 46, + "task_loss": 0.7575843930244446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3843839168548584, + "epoch": 0.04, + "learning_rate": 4.971621784808598e-05, + "loss": 0.4116, + "step": 47, + "task_loss": 0.8450673818588257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2406119555234909, + "epoch": 0.04, + "learning_rate": 4.9710179929960156e-05, + "loss": 0.3084, + "step": 48, + "task_loss": 0.16761846840381622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35507768392562866, + "epoch": 0.04, + "learning_rate": 4.970414201183432e-05, + "loss": 0.3447, + "step": 49, + "task_loss": 0.42451998591423035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21716907620429993, + "epoch": 0.04, + "learning_rate": 4.969810409370849e-05, + "loss": 0.2907, + "step": 50, + "task_loss": 0.3825681507587433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30666980147361755, + "epoch": 0.04, + "learning_rate": 4.9692066175582664e-05, + "loss": 0.3688, + "step": 51, + "task_loss": 0.5329431295394897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22709280252456665, + "epoch": 0.04, + "learning_rate": 4.968602825745683e-05, + "loss": 0.2923, + "step": 52, + "task_loss": 0.24181126058101654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.287906676530838, + "epoch": 0.04, + "learning_rate": 4.9679990339331e-05, + "loss": 0.3116, + "step": 53, + "task_loss": 0.24539609253406525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20339882373809814, + "epoch": 0.05, + "learning_rate": 4.967395242120517e-05, + "loss": 0.3187, + "step": 54, + "task_loss": 0.5900417566299438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2680729627609253, + "epoch": 0.05, + "learning_rate": 4.966791450307934e-05, + "loss": 0.3322, + "step": 55, + "task_loss": 1.2729666233062744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2620672583580017, + "epoch": 0.05, + "learning_rate": 4.966187658495351e-05, + "loss": 0.3288, + "step": 56, + "task_loss": 0.7690536975860596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2558075189590454, + "epoch": 0.05, + "learning_rate": 4.965583866682768e-05, + "loss": 0.2878, + "step": 57, + "task_loss": 0.45123398303985596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47659721970558167, + "epoch": 0.05, + "learning_rate": 4.9649800748701855e-05, + "loss": 0.4099, + "step": 58, + "task_loss": 0.394521027803421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19426442682743073, + "epoch": 0.05, + "learning_rate": 4.9643762830576015e-05, + "loss": 0.2888, + "step": 59, + "task_loss": 0.2497028261423111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2723541259765625, + "epoch": 0.05, + "learning_rate": 4.963772491245019e-05, + "loss": 0.2959, + "step": 60, + "task_loss": 0.3907150626182556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5044282674789429, + "epoch": 0.05, + "learning_rate": 4.963168699432436e-05, + "loss": 0.391, + "step": 61, + "task_loss": 0.6892834901809692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3972747027873993, + "epoch": 0.05, + "learning_rate": 4.962564907619853e-05, + "loss": 0.3642, + "step": 62, + "task_loss": 1.03528892993927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44778820872306824, + "epoch": 0.05, + "learning_rate": 4.96196111580727e-05, + "loss": 0.313, + "step": 63, + "task_loss": 0.6543607711791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4266464114189148, + "epoch": 0.05, + "learning_rate": 4.961357323994687e-05, + "loss": 0.4076, + "step": 64, + "task_loss": 0.6509642004966736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1449139416217804, + "epoch": 0.05, + "learning_rate": 4.960753532182104e-05, + "loss": 0.3039, + "step": 65, + "task_loss": 0.38982975482940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3253495693206787, + "epoch": 0.06, + "learning_rate": 4.9601497403695206e-05, + "loss": 0.37, + "step": 66, + "task_loss": 0.7798401117324829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35335421562194824, + "epoch": 0.06, + "learning_rate": 4.959545948556938e-05, + "loss": 0.3082, + "step": 67, + "task_loss": 0.7975855469703674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4480602741241455, + "epoch": 0.06, + "learning_rate": 4.958942156744355e-05, + "loss": 0.3148, + "step": 68, + "task_loss": 0.18220213055610657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3367661237716675, + "epoch": 0.06, + "learning_rate": 4.9583383649317714e-05, + "loss": 0.2575, + "step": 69, + "task_loss": 0.6136961579322815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5279454588890076, + "epoch": 0.06, + "learning_rate": 4.957734573119189e-05, + "loss": 0.4337, + "step": 70, + "task_loss": 0.7228941917419434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33608484268188477, + "epoch": 0.06, + "learning_rate": 4.957130781306606e-05, + "loss": 0.3909, + "step": 71, + "task_loss": 1.099329948425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29355472326278687, + "epoch": 0.06, + "learning_rate": 4.956526989494022e-05, + "loss": 0.3845, + "step": 72, + "task_loss": 0.5266224145889282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31957435607910156, + "epoch": 0.06, + "learning_rate": 4.9559231976814396e-05, + "loss": 0.3339, + "step": 73, + "task_loss": 0.43144696950912476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4300149977207184, + "epoch": 0.06, + "learning_rate": 4.955319405868857e-05, + "loss": 0.3992, + "step": 74, + "task_loss": 0.570293664932251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8164573907852173, + "epoch": 0.06, + "learning_rate": 4.954715614056273e-05, + "loss": 0.4478, + "step": 75, + "task_loss": 1.3061954975128174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2138664424419403, + "epoch": 0.06, + "learning_rate": 4.9541118222436905e-05, + "loss": 0.5357, + "step": 76, + "task_loss": 0.4738396406173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20729652047157288, + "epoch": 0.07, + "learning_rate": 4.953508030431108e-05, + "loss": 0.2988, + "step": 77, + "task_loss": 0.1066388487815857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3723217248916626, + "epoch": 0.07, + "learning_rate": 4.9529042386185246e-05, + "loss": 0.3181, + "step": 78, + "task_loss": 0.7636824250221252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29050979018211365, + "epoch": 0.07, + "learning_rate": 4.952300446805941e-05, + "loss": 0.3519, + "step": 79, + "task_loss": 0.48806482553482056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5898647904396057, + "epoch": 0.07, + "learning_rate": 4.951696654993359e-05, + "loss": 0.3993, + "step": 80, + "task_loss": 0.43252667784690857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34061485528945923, + "epoch": 0.07, + "learning_rate": 4.9510928631807754e-05, + "loss": 0.3443, + "step": 81, + "task_loss": 1.4975894689559937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24292877316474915, + "epoch": 0.07, + "learning_rate": 4.950489071368192e-05, + "loss": 0.263, + "step": 82, + "task_loss": 0.5394057631492615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28718599677085876, + "epoch": 0.07, + "learning_rate": 4.9498852795556095e-05, + "loss": 0.2962, + "step": 83, + "task_loss": 0.22143614292144775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3874911069869995, + "epoch": 0.07, + "learning_rate": 4.949281487743026e-05, + "loss": 0.3169, + "step": 84, + "task_loss": 0.4732312560081482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3567826747894287, + "epoch": 0.07, + "learning_rate": 4.948677695930443e-05, + "loss": 0.3102, + "step": 85, + "task_loss": 1.1696805953979492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25368592143058777, + "epoch": 0.07, + "learning_rate": 4.9480739041178604e-05, + "loss": 0.4811, + "step": 86, + "task_loss": 0.19513137638568878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3893371522426605, + "epoch": 0.07, + "learning_rate": 4.947470112305278e-05, + "loss": 0.4123, + "step": 87, + "task_loss": 0.8287002444267273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27553173899650574, + "epoch": 0.07, + "learning_rate": 4.9468663204926945e-05, + "loss": 0.2527, + "step": 88, + "task_loss": 0.4779343008995056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2027340829372406, + "epoch": 0.08, + "learning_rate": 4.946262528680111e-05, + "loss": 0.2572, + "step": 89, + "task_loss": 0.2104167938232422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22549833357334137, + "epoch": 0.08, + "learning_rate": 4.9456587368675286e-05, + "loss": 0.324, + "step": 90, + "task_loss": 0.4265645742416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4884544014930725, + "epoch": 0.08, + "learning_rate": 4.945054945054945e-05, + "loss": 0.3642, + "step": 91, + "task_loss": 0.34880319237709045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6234738230705261, + "epoch": 0.08, + "learning_rate": 4.944451153242362e-05, + "loss": 0.4123, + "step": 92, + "task_loss": 0.6439784169197083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2962171137332916, + "epoch": 0.08, + "learning_rate": 4.9438473614297794e-05, + "loss": 0.4553, + "step": 93, + "task_loss": 0.9175675511360168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3969919681549072, + "epoch": 0.08, + "learning_rate": 4.943243569617196e-05, + "loss": 0.3523, + "step": 94, + "task_loss": 0.32375988364219666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5437160730361938, + "epoch": 0.08, + "learning_rate": 4.942639777804613e-05, + "loss": 0.469, + "step": 95, + "task_loss": 0.8413633108139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34137535095214844, + "epoch": 0.08, + "learning_rate": 4.94203598599203e-05, + "loss": 0.4128, + "step": 96, + "task_loss": 1.7086763381958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2708493769168854, + "epoch": 0.08, + "learning_rate": 4.941432194179447e-05, + "loss": 0.342, + "step": 97, + "task_loss": 0.5729495286941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3845900297164917, + "epoch": 0.08, + "learning_rate": 4.9408284023668644e-05, + "loss": 0.3459, + "step": 98, + "task_loss": 0.640835165977478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17166396975517273, + "epoch": 0.08, + "learning_rate": 4.940224610554281e-05, + "loss": 0.2638, + "step": 99, + "task_loss": 0.4222128987312317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5104278922080994, + "epoch": 0.08, + "learning_rate": 4.939620818741698e-05, + "loss": 0.4602, + "step": 100, + "task_loss": 1.2593507766723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3089989423751831, + "epoch": 0.09, + "learning_rate": 4.939017026929115e-05, + "loss": 0.3225, + "step": 101, + "task_loss": 0.26729482412338257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43301767110824585, + "epoch": 0.09, + "learning_rate": 4.938413235116532e-05, + "loss": 0.3929, + "step": 102, + "task_loss": 1.302262783050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4287503659725189, + "epoch": 0.09, + "learning_rate": 4.937809443303949e-05, + "loss": 0.485, + "step": 103, + "task_loss": 0.9651143550872803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5477359890937805, + "epoch": 0.09, + "learning_rate": 4.937205651491366e-05, + "loss": 0.4382, + "step": 104, + "task_loss": 1.432396650314331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5919656157493591, + "epoch": 0.09, + "learning_rate": 4.936601859678783e-05, + "loss": 0.343, + "step": 105, + "task_loss": 0.8800464272499084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24281111359596252, + "epoch": 0.09, + "learning_rate": 4.9359980678662e-05, + "loss": 0.3865, + "step": 106, + "task_loss": 0.5352722406387329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2820776402950287, + "epoch": 0.09, + "learning_rate": 4.935394276053617e-05, + "loss": 0.4199, + "step": 107, + "task_loss": 0.9807997345924377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22061243653297424, + "epoch": 0.09, + "learning_rate": 4.934790484241034e-05, + "loss": 0.3277, + "step": 108, + "task_loss": 0.15575382113456726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4099520742893219, + "epoch": 0.09, + "learning_rate": 4.934186692428451e-05, + "loss": 0.3114, + "step": 109, + "task_loss": 0.48929744958877563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24980920553207397, + "epoch": 0.09, + "learning_rate": 4.933582900615868e-05, + "loss": 0.3978, + "step": 110, + "task_loss": 0.2342258244752884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35572949051856995, + "epoch": 0.09, + "learning_rate": 4.932979108803285e-05, + "loss": 0.4319, + "step": 111, + "task_loss": 0.882225513458252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20163123309612274, + "epoch": 0.09, + "learning_rate": 4.932375316990702e-05, + "loss": 0.3542, + "step": 112, + "task_loss": 0.024291040375828743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33488476276397705, + "epoch": 0.1, + "learning_rate": 4.9317715251781185e-05, + "loss": 0.2929, + "step": 113, + "task_loss": 0.6396358013153076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23511746525764465, + "epoch": 0.1, + "learning_rate": 4.931167733365536e-05, + "loss": 0.3756, + "step": 114, + "task_loss": 0.3878803253173828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47338762879371643, + "epoch": 0.1, + "learning_rate": 4.9305639415529527e-05, + "loss": 0.3755, + "step": 115, + "task_loss": 0.3775898814201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.338032603263855, + "epoch": 0.1, + "learning_rate": 4.9299601497403694e-05, + "loss": 0.4114, + "step": 116, + "task_loss": 0.4424063563346863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6006137728691101, + "epoch": 0.1, + "learning_rate": 4.929356357927787e-05, + "loss": 0.441, + "step": 117, + "task_loss": 0.7305097579956055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5968002080917358, + "epoch": 0.1, + "learning_rate": 4.928752566115204e-05, + "loss": 0.4182, + "step": 118, + "task_loss": 0.6539137959480286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2060955911874771, + "epoch": 0.1, + "learning_rate": 4.928148774302621e-05, + "loss": 0.4359, + "step": 119, + "task_loss": 0.23030970990657806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41826102137565613, + "epoch": 0.1, + "learning_rate": 4.9275449824900376e-05, + "loss": 0.5222, + "step": 120, + "task_loss": 1.048397183418274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45675361156463623, + "epoch": 0.1, + "learning_rate": 4.926941190677455e-05, + "loss": 0.4799, + "step": 121, + "task_loss": 0.18881812691688538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1543813943862915, + "epoch": 0.1, + "learning_rate": 4.926337398864872e-05, + "loss": 0.2702, + "step": 122, + "task_loss": 0.5565088391304016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22674205899238586, + "epoch": 0.1, + "learning_rate": 4.9257336070522884e-05, + "loss": 0.2697, + "step": 123, + "task_loss": 0.8856381773948669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44395679235458374, + "epoch": 0.1, + "learning_rate": 4.925129815239706e-05, + "loss": 0.3386, + "step": 124, + "task_loss": 0.6910078525543213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6182366609573364, + "epoch": 0.11, + "learning_rate": 4.9245260234271226e-05, + "loss": 0.3977, + "step": 125, + "task_loss": 0.36996322870254517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3325819969177246, + "epoch": 0.11, + "learning_rate": 4.923922231614539e-05, + "loss": 0.3033, + "step": 126, + "task_loss": 1.2850160598754883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38252389430999756, + "epoch": 0.11, + "learning_rate": 4.923318439801957e-05, + "loss": 0.3852, + "step": 127, + "task_loss": 0.33823931217193604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3019288182258606, + "epoch": 0.11, + "learning_rate": 4.922714647989374e-05, + "loss": 0.4403, + "step": 128, + "task_loss": 2.0719704627990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3690018951892853, + "epoch": 0.11, + "learning_rate": 4.92211085617679e-05, + "loss": 0.4573, + "step": 129, + "task_loss": 1.5507882833480835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4736906886100769, + "epoch": 0.11, + "learning_rate": 4.9215070643642075e-05, + "loss": 0.2867, + "step": 130, + "task_loss": 0.25044187903404236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30232638120651245, + "epoch": 0.11, + "learning_rate": 4.920903272551625e-05, + "loss": 0.3516, + "step": 131, + "task_loss": 1.6450247764587402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1854860931634903, + "epoch": 0.11, + "learning_rate": 4.920299480739041e-05, + "loss": 0.3058, + "step": 132, + "task_loss": 0.06923609972000122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.407135933637619, + "epoch": 0.11, + "learning_rate": 4.919695688926458e-05, + "loss": 0.4332, + "step": 133, + "task_loss": 1.0303617715835571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37842056155204773, + "epoch": 0.11, + "learning_rate": 4.919091897113876e-05, + "loss": 0.4169, + "step": 134, + "task_loss": 1.5742859840393066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11493265628814697, + "epoch": 0.11, + "learning_rate": 4.9184881053012924e-05, + "loss": 0.2742, + "step": 135, + "task_loss": 0.3966679275035858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5346572995185852, + "epoch": 0.11, + "learning_rate": 4.917884313488709e-05, + "loss": 0.4029, + "step": 136, + "task_loss": 0.9560654163360596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24311745166778564, + "epoch": 0.12, + "learning_rate": 4.9172805216761266e-05, + "loss": 0.4293, + "step": 137, + "task_loss": 0.11503170430660248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23056933283805847, + "epoch": 0.12, + "learning_rate": 4.916676729863543e-05, + "loss": 0.3317, + "step": 138, + "task_loss": 0.5454956889152527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2211146354675293, + "epoch": 0.12, + "learning_rate": 4.91607293805096e-05, + "loss": 0.2856, + "step": 139, + "task_loss": 0.07383111864328384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40937942266464233, + "epoch": 0.12, + "learning_rate": 4.9154691462383774e-05, + "loss": 0.3256, + "step": 140, + "task_loss": 0.9068475365638733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23422488570213318, + "epoch": 0.12, + "learning_rate": 4.914865354425794e-05, + "loss": 0.3743, + "step": 141, + "task_loss": 0.772557258605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3856215476989746, + "epoch": 0.12, + "learning_rate": 4.914261562613211e-05, + "loss": 0.3121, + "step": 142, + "task_loss": 1.003980278968811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2954632639884949, + "epoch": 0.12, + "learning_rate": 4.913657770800628e-05, + "loss": 0.2943, + "step": 143, + "task_loss": 1.9202251434326172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17213040590286255, + "epoch": 0.12, + "learning_rate": 4.9130539789880456e-05, + "loss": 0.2983, + "step": 144, + "task_loss": 1.1155539751052856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33316195011138916, + "epoch": 0.12, + "learning_rate": 4.912450187175462e-05, + "loss": 0.3475, + "step": 145, + "task_loss": 0.5890751481056213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43856483697891235, + "epoch": 0.12, + "learning_rate": 4.911846395362879e-05, + "loss": 0.3749, + "step": 146, + "task_loss": 1.2088651657104492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3035190999507904, + "epoch": 0.12, + "learning_rate": 4.9112426035502965e-05, + "loss": 0.3782, + "step": 147, + "task_loss": 0.5619529485702515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29764872789382935, + "epoch": 0.13, + "learning_rate": 4.910638811737713e-05, + "loss": 0.3253, + "step": 148, + "task_loss": 0.8805117011070251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22491507232189178, + "epoch": 0.13, + "learning_rate": 4.91003501992513e-05, + "loss": 0.4551, + "step": 149, + "task_loss": 0.2589697241783142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4931153357028961, + "epoch": 0.13, + "learning_rate": 4.909431228112547e-05, + "loss": 0.4107, + "step": 150, + "task_loss": 0.5675384998321533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4110710620880127, + "epoch": 0.13, + "learning_rate": 4.908827436299964e-05, + "loss": 0.4342, + "step": 151, + "task_loss": 0.7999531030654907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34637752175331116, + "epoch": 0.13, + "learning_rate": 4.908223644487381e-05, + "loss": 0.4092, + "step": 152, + "task_loss": 0.38146457076072693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30415773391723633, + "epoch": 0.13, + "learning_rate": 4.907619852674798e-05, + "loss": 0.4607, + "step": 153, + "task_loss": 0.6748791933059692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6208047866821289, + "epoch": 0.13, + "learning_rate": 4.907016060862215e-05, + "loss": 0.4497, + "step": 154, + "task_loss": 0.5220504403114319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3292478322982788, + "epoch": 0.13, + "learning_rate": 4.9064122690496316e-05, + "loss": 0.4716, + "step": 155, + "task_loss": 0.5408774018287659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3978666663169861, + "epoch": 0.13, + "learning_rate": 4.905808477237049e-05, + "loss": 0.3863, + "step": 156, + "task_loss": 0.3390129506587982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34065455198287964, + "epoch": 0.13, + "learning_rate": 4.905204685424466e-05, + "loss": 0.3771, + "step": 157, + "task_loss": 0.7971680164337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5941371917724609, + "epoch": 0.13, + "learning_rate": 4.904600893611883e-05, + "loss": 0.5904, + "step": 158, + "task_loss": 0.24042095243930817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25876864790916443, + "epoch": 0.13, + "learning_rate": 4.9039971017993e-05, + "loss": 0.4258, + "step": 159, + "task_loss": 0.19387097656726837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16864356398582458, + "epoch": 0.14, + "learning_rate": 4.903393309986717e-05, + "loss": 0.2852, + "step": 160, + "task_loss": 0.6018144488334656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5536223649978638, + "epoch": 0.14, + "learning_rate": 4.902789518174134e-05, + "loss": 0.3592, + "step": 161, + "task_loss": 0.5275382399559021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2101719081401825, + "epoch": 0.14, + "learning_rate": 4.9021857263615506e-05, + "loss": 0.3061, + "step": 162, + "task_loss": 0.015712212771177292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15919873118400574, + "epoch": 0.14, + "learning_rate": 4.901581934548968e-05, + "loss": 0.2562, + "step": 163, + "task_loss": 0.5559214949607849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32795944809913635, + "epoch": 0.14, + "learning_rate": 4.900978142736385e-05, + "loss": 0.3572, + "step": 164, + "task_loss": 0.9139054417610168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2968144118785858, + "epoch": 0.14, + "learning_rate": 4.9003743509238014e-05, + "loss": 0.3407, + "step": 165, + "task_loss": 0.8456998467445374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4627693295478821, + "epoch": 0.14, + "learning_rate": 4.899770559111219e-05, + "loss": 0.5055, + "step": 166, + "task_loss": 0.9660850167274475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3389841616153717, + "epoch": 0.14, + "learning_rate": 4.8991667672986356e-05, + "loss": 0.3668, + "step": 167, + "task_loss": 1.301005482673645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17484796047210693, + "epoch": 0.14, + "learning_rate": 4.898562975486053e-05, + "loss": 0.3587, + "step": 168, + "task_loss": 0.0901103988289833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2166406214237213, + "epoch": 0.14, + "learning_rate": 4.89795918367347e-05, + "loss": 0.2322, + "step": 169, + "task_loss": 0.026940368115901947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20582215487957, + "epoch": 0.14, + "learning_rate": 4.8973553918608864e-05, + "loss": 0.2233, + "step": 170, + "task_loss": 0.2768481373786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4102306067943573, + "epoch": 0.14, + "learning_rate": 4.896751600048304e-05, + "loss": 0.2917, + "step": 171, + "task_loss": 0.7136456966400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17633290588855743, + "epoch": 0.15, + "learning_rate": 4.8961478082357205e-05, + "loss": 0.2741, + "step": 172, + "task_loss": 0.5400669574737549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37040984630584717, + "epoch": 0.15, + "learning_rate": 4.895544016423137e-05, + "loss": 0.3682, + "step": 173, + "task_loss": 0.5297955870628357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36008045077323914, + "epoch": 0.15, + "learning_rate": 4.8949402246105546e-05, + "loss": 0.3865, + "step": 174, + "task_loss": 1.293094515800476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3012641966342926, + "epoch": 0.15, + "learning_rate": 4.8943364327979713e-05, + "loss": 0.3107, + "step": 175, + "task_loss": 0.6071553826332092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16874560713768005, + "epoch": 0.15, + "learning_rate": 4.893732640985389e-05, + "loss": 0.3568, + "step": 176, + "task_loss": 0.48791053891181946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.305328905582428, + "epoch": 0.15, + "learning_rate": 4.8931288491728055e-05, + "loss": 0.3334, + "step": 177, + "task_loss": 0.5327244997024536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19686727225780487, + "epoch": 0.15, + "learning_rate": 4.892525057360222e-05, + "loss": 0.2972, + "step": 178, + "task_loss": 0.7281004190444946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31708723306655884, + "epoch": 0.15, + "learning_rate": 4.8919212655476396e-05, + "loss": 0.3641, + "step": 179, + "task_loss": 0.4216811954975128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35918673872947693, + "epoch": 0.15, + "learning_rate": 4.891317473735056e-05, + "loss": 0.3884, + "step": 180, + "task_loss": 0.689250111579895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25123131275177, + "epoch": 0.15, + "learning_rate": 4.890713681922474e-05, + "loss": 0.356, + "step": 181, + "task_loss": 0.5716760158538818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34370747208595276, + "epoch": 0.15, + "learning_rate": 4.8901098901098904e-05, + "loss": 0.338, + "step": 182, + "task_loss": 0.8569716811180115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19864091277122498, + "epoch": 0.15, + "learning_rate": 4.889506098297307e-05, + "loss": 0.3252, + "step": 183, + "task_loss": 0.659917950630188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31270259618759155, + "epoch": 0.16, + "learning_rate": 4.8889023064847245e-05, + "loss": 0.3916, + "step": 184, + "task_loss": 1.4612730741500854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37377360463142395, + "epoch": 0.16, + "learning_rate": 4.888298514672141e-05, + "loss": 0.5808, + "step": 185, + "task_loss": 0.7126050591468811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29406899213790894, + "epoch": 0.16, + "learning_rate": 4.887694722859558e-05, + "loss": 0.4167, + "step": 186, + "task_loss": 1.1095341444015503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1254437267780304, + "epoch": 0.16, + "learning_rate": 4.8870909310469754e-05, + "loss": 0.2659, + "step": 187, + "task_loss": 0.3575878441333771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4643031358718872, + "epoch": 0.16, + "learning_rate": 4.886487139234392e-05, + "loss": 0.4302, + "step": 188, + "task_loss": 0.6942890286445618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5741887092590332, + "epoch": 0.16, + "learning_rate": 4.885883347421809e-05, + "loss": 0.3795, + "step": 189, + "task_loss": 1.2970455884933472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24287299811840057, + "epoch": 0.16, + "learning_rate": 4.885279555609226e-05, + "loss": 0.2826, + "step": 190, + "task_loss": 0.585664689540863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1484246850013733, + "epoch": 0.16, + "learning_rate": 4.8846757637966436e-05, + "loss": 0.2874, + "step": 191, + "task_loss": 0.05447469651699066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43349575996398926, + "epoch": 0.16, + "learning_rate": 4.88407197198406e-05, + "loss": 0.2905, + "step": 192, + "task_loss": 0.7433083057403564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3356369733810425, + "epoch": 0.16, + "learning_rate": 4.883468180171477e-05, + "loss": 0.404, + "step": 193, + "task_loss": 0.25878268480300903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44204750657081604, + "epoch": 0.16, + "learning_rate": 4.8828643883588944e-05, + "loss": 0.3698, + "step": 194, + "task_loss": 0.44937726855278015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5335066914558411, + "epoch": 0.16, + "learning_rate": 4.882260596546311e-05, + "loss": 0.3701, + "step": 195, + "task_loss": 0.5461884140968323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19670599699020386, + "epoch": 0.17, + "learning_rate": 4.881656804733728e-05, + "loss": 0.2998, + "step": 196, + "task_loss": 0.8536654710769653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7039417028427124, + "epoch": 0.17, + "learning_rate": 4.881053012921145e-05, + "loss": 0.4345, + "step": 197, + "task_loss": 0.5822523236274719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43034815788269043, + "epoch": 0.17, + "learning_rate": 4.880449221108562e-05, + "loss": 0.4119, + "step": 198, + "task_loss": 0.6032153964042664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2675672769546509, + "epoch": 0.17, + "learning_rate": 4.879845429295979e-05, + "loss": 0.4151, + "step": 199, + "task_loss": 0.6427316665649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5726067423820496, + "epoch": 0.17, + "learning_rate": 4.879241637483396e-05, + "loss": 0.4373, + "step": 200, + "task_loss": 1.5375341176986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3098316490650177, + "epoch": 0.17, + "learning_rate": 4.8786378456708135e-05, + "loss": 0.2882, + "step": 201, + "task_loss": 0.7666797637939453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37163761258125305, + "epoch": 0.17, + "learning_rate": 4.8780340538582295e-05, + "loss": 0.3448, + "step": 202, + "task_loss": 0.5320181846618652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16711682081222534, + "epoch": 0.17, + "learning_rate": 4.877430262045647e-05, + "loss": 0.3622, + "step": 203, + "task_loss": 0.19219565391540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2333192229270935, + "epoch": 0.17, + "learning_rate": 4.876826470233064e-05, + "loss": 0.3458, + "step": 204, + "task_loss": 0.5413380861282349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.573334813117981, + "epoch": 0.17, + "learning_rate": 4.8762226784204803e-05, + "loss": 0.3731, + "step": 205, + "task_loss": 0.18013392388820648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17017324268817902, + "epoch": 0.17, + "learning_rate": 4.875618886607898e-05, + "loss": 0.337, + "step": 206, + "task_loss": 0.2642764151096344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3357846438884735, + "epoch": 0.17, + "learning_rate": 4.875015094795315e-05, + "loss": 0.3443, + "step": 207, + "task_loss": 0.8511983156204224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2904583811759949, + "epoch": 0.18, + "learning_rate": 4.874411302982732e-05, + "loss": 0.3644, + "step": 208, + "task_loss": 0.6950564384460449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6159971356391907, + "epoch": 0.18, + "learning_rate": 4.8738075111701486e-05, + "loss": 0.417, + "step": 209, + "task_loss": 1.0051993131637573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2751395106315613, + "epoch": 0.18, + "learning_rate": 4.873203719357566e-05, + "loss": 0.259, + "step": 210, + "task_loss": 0.44548681378364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14952373504638672, + "epoch": 0.18, + "learning_rate": 4.872599927544983e-05, + "loss": 0.3419, + "step": 211, + "task_loss": 0.2996128797531128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.240716353058815, + "epoch": 0.18, + "learning_rate": 4.8719961357323994e-05, + "loss": 0.3574, + "step": 212, + "task_loss": 0.5914297699928284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3645804524421692, + "epoch": 0.18, + "learning_rate": 4.871392343919817e-05, + "loss": 0.3481, + "step": 213, + "task_loss": 0.07440241426229477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3713075816631317, + "epoch": 0.18, + "learning_rate": 4.8707885521072335e-05, + "loss": 0.4508, + "step": 214, + "task_loss": 1.505163550376892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4446750581264496, + "epoch": 0.18, + "learning_rate": 4.87018476029465e-05, + "loss": 0.4169, + "step": 215, + "task_loss": 0.6080370545387268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42608070373535156, + "epoch": 0.18, + "learning_rate": 4.8695809684820676e-05, + "loss": 0.34, + "step": 216, + "task_loss": 1.165233850479126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44962722063064575, + "epoch": 0.18, + "learning_rate": 4.868977176669485e-05, + "loss": 0.4527, + "step": 217, + "task_loss": 1.359510898590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.347635954618454, + "epoch": 0.18, + "learning_rate": 4.868373384856901e-05, + "loss": 0.463, + "step": 218, + "task_loss": 0.4809846580028534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24371974170207977, + "epoch": 0.19, + "learning_rate": 4.8677695930443185e-05, + "loss": 0.3572, + "step": 219, + "task_loss": 0.7009860277175903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25305095314979553, + "epoch": 0.19, + "learning_rate": 4.867165801231736e-05, + "loss": 0.3178, + "step": 220, + "task_loss": 0.6486716866493225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21840791404247284, + "epoch": 0.19, + "learning_rate": 4.8665620094191526e-05, + "loss": 0.2503, + "step": 221, + "task_loss": 0.2857961356639862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22764679789543152, + "epoch": 0.19, + "learning_rate": 4.865958217606569e-05, + "loss": 0.2923, + "step": 222, + "task_loss": 1.1480791568756104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6173543334007263, + "epoch": 0.19, + "learning_rate": 4.865354425793987e-05, + "loss": 0.441, + "step": 223, + "task_loss": 0.7470206618309021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.557291567325592, + "epoch": 0.19, + "learning_rate": 4.8647506339814034e-05, + "loss": 0.4489, + "step": 224, + "task_loss": 0.689177930355072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42682474851608276, + "epoch": 0.19, + "learning_rate": 4.86414684216882e-05, + "loss": 0.3617, + "step": 225, + "task_loss": 0.6550009250640869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1677248775959015, + "epoch": 0.19, + "learning_rate": 4.8635430503562375e-05, + "loss": 0.2983, + "step": 226, + "task_loss": 0.2665528357028961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3583166003227234, + "epoch": 0.19, + "learning_rate": 4.862939258543654e-05, + "loss": 0.4007, + "step": 227, + "task_loss": 0.4113485515117645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22209115326404572, + "epoch": 0.19, + "learning_rate": 4.862335466731071e-05, + "loss": 0.4767, + "step": 228, + "task_loss": 1.2220337390899658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.591152012348175, + "epoch": 0.19, + "learning_rate": 4.8617316749184884e-05, + "loss": 0.4903, + "step": 229, + "task_loss": 1.7407927513122559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29130685329437256, + "epoch": 0.19, + "learning_rate": 4.861127883105905e-05, + "loss": 0.3066, + "step": 230, + "task_loss": 0.8779850602149963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27365842461586, + "epoch": 0.2, + "learning_rate": 4.8605240912933225e-05, + "loss": 0.3202, + "step": 231, + "task_loss": 0.9441163539886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2616477906703949, + "epoch": 0.2, + "learning_rate": 4.859920299480739e-05, + "loss": 0.3274, + "step": 232, + "task_loss": 0.8562521934509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3808269798755646, + "epoch": 0.2, + "learning_rate": 4.8593165076681566e-05, + "loss": 0.2633, + "step": 233, + "task_loss": 0.5619776248931885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28243935108184814, + "epoch": 0.2, + "learning_rate": 4.858712715855573e-05, + "loss": 0.3988, + "step": 234, + "task_loss": 0.32802894711494446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33323073387145996, + "epoch": 0.2, + "learning_rate": 4.85810892404299e-05, + "loss": 0.3971, + "step": 235, + "task_loss": 0.6267164349555969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21033337712287903, + "epoch": 0.2, + "learning_rate": 4.8575051322304074e-05, + "loss": 0.2833, + "step": 236, + "task_loss": 0.4125954806804657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5221607685089111, + "epoch": 0.2, + "learning_rate": 4.856901340417824e-05, + "loss": 0.5004, + "step": 237, + "task_loss": 1.5617070198059082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.198834627866745, + "epoch": 0.2, + "learning_rate": 4.856297548605241e-05, + "loss": 0.5516, + "step": 238, + "task_loss": 1.176544189453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3844684958457947, + "epoch": 0.2, + "learning_rate": 4.855693756792658e-05, + "loss": 0.4421, + "step": 239, + "task_loss": 0.7043779492378235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27848494052886963, + "epoch": 0.2, + "learning_rate": 4.855089964980075e-05, + "loss": 0.2971, + "step": 240, + "task_loss": 0.2784903347492218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7233941555023193, + "epoch": 0.2, + "learning_rate": 4.8544861731674924e-05, + "loss": 0.4211, + "step": 241, + "task_loss": 0.6842274069786072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2772015631198883, + "epoch": 0.2, + "learning_rate": 4.853882381354909e-05, + "loss": 0.3696, + "step": 242, + "task_loss": 0.4753953516483307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28759765625, + "epoch": 0.21, + "learning_rate": 4.853278589542326e-05, + "loss": 0.3984, + "step": 243, + "task_loss": 0.3085748553276062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45128703117370605, + "epoch": 0.21, + "learning_rate": 4.852674797729743e-05, + "loss": 0.3283, + "step": 244, + "task_loss": 0.7327007055282593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36166709661483765, + "epoch": 0.21, + "learning_rate": 4.85207100591716e-05, + "loss": 0.4065, + "step": 245, + "task_loss": 0.17605814337730408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25296056270599365, + "epoch": 0.21, + "learning_rate": 4.8514672141045766e-05, + "loss": 0.3166, + "step": 246, + "task_loss": 0.524057924747467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3108903765678406, + "epoch": 0.21, + "learning_rate": 4.850863422291994e-05, + "loss": 0.3726, + "step": 247, + "task_loss": 0.6622505187988281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2930804491043091, + "epoch": 0.21, + "learning_rate": 4.850259630479411e-05, + "loss": 0.3462, + "step": 248, + "task_loss": 0.2555572986602783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47002166509628296, + "epoch": 0.21, + "learning_rate": 4.849655838666828e-05, + "loss": 0.388, + "step": 249, + "task_loss": 0.3438532054424286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5938379168510437, + "epoch": 0.21, + "learning_rate": 4.849052046854245e-05, + "loss": 0.3176, + "step": 250, + "task_loss": 0.28426429629325867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4488550126552582, + "epoch": 0.21, + "learning_rate": 4.848448255041662e-05, + "loss": 0.3943, + "step": 251, + "task_loss": 1.217559814453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21565239131450653, + "epoch": 0.21, + "learning_rate": 4.847844463229079e-05, + "loss": 0.367, + "step": 252, + "task_loss": 0.8230108022689819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20017766952514648, + "epoch": 0.21, + "learning_rate": 4.847240671416496e-05, + "loss": 0.4238, + "step": 253, + "task_loss": 0.6060188412666321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37959274649620056, + "epoch": 0.21, + "learning_rate": 4.846636879603913e-05, + "loss": 0.305, + "step": 254, + "task_loss": 0.9876610040664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39832544326782227, + "epoch": 0.22, + "learning_rate": 4.84603308779133e-05, + "loss": 0.2947, + "step": 255, + "task_loss": 0.5021925568580627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3502846360206604, + "epoch": 0.22, + "learning_rate": 4.8454292959787465e-05, + "loss": 0.3081, + "step": 256, + "task_loss": 0.6578946709632874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19270899891853333, + "epoch": 0.22, + "learning_rate": 4.844825504166164e-05, + "loss": 0.2989, + "step": 257, + "task_loss": 0.5173934698104858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3227487802505493, + "epoch": 0.22, + "learning_rate": 4.8442217123535806e-05, + "loss": 0.4644, + "step": 258, + "task_loss": 0.6536293625831604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5462640523910522, + "epoch": 0.22, + "learning_rate": 4.8436179205409974e-05, + "loss": 0.3752, + "step": 259, + "task_loss": 1.1624946594238281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3551231026649475, + "epoch": 0.22, + "learning_rate": 4.843014128728415e-05, + "loss": 0.3021, + "step": 260, + "task_loss": 0.11768197268247604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28162717819213867, + "epoch": 0.22, + "learning_rate": 4.842410336915832e-05, + "loss": 0.3605, + "step": 261, + "task_loss": 1.2060271501541138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2618270516395569, + "epoch": 0.22, + "learning_rate": 4.841806545103248e-05, + "loss": 0.4038, + "step": 262, + "task_loss": 0.6179669499397278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5169287323951721, + "epoch": 0.22, + "learning_rate": 4.8412027532906656e-05, + "loss": 0.4219, + "step": 263, + "task_loss": 0.6710100173950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3042276203632355, + "epoch": 0.22, + "learning_rate": 4.840598961478083e-05, + "loss": 0.3309, + "step": 264, + "task_loss": 0.4049620032310486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4004916548728943, + "epoch": 0.22, + "learning_rate": 4.8399951696655e-05, + "loss": 0.3626, + "step": 265, + "task_loss": 0.7619703412055969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18103118240833282, + "epoch": 0.22, + "learning_rate": 4.8393913778529164e-05, + "loss": 0.3102, + "step": 266, + "task_loss": 0.12635083496570587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3955288231372833, + "epoch": 0.23, + "learning_rate": 4.838787586040334e-05, + "loss": 0.3845, + "step": 267, + "task_loss": 0.8333910703659058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3358483910560608, + "epoch": 0.23, + "learning_rate": 4.8381837942277505e-05, + "loss": 0.3969, + "step": 268, + "task_loss": 0.7327327728271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6183207035064697, + "epoch": 0.23, + "learning_rate": 4.837580002415167e-05, + "loss": 0.4966, + "step": 269, + "task_loss": 0.7672142386436462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4284032881259918, + "epoch": 0.23, + "learning_rate": 4.8369762106025847e-05, + "loss": 0.3588, + "step": 270, + "task_loss": 0.38554489612579346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.298435240983963, + "epoch": 0.23, + "learning_rate": 4.8363724187900014e-05, + "loss": 0.3192, + "step": 271, + "task_loss": 0.8453357219696045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18604081869125366, + "epoch": 0.23, + "learning_rate": 4.835768626977418e-05, + "loss": 0.4136, + "step": 272, + "task_loss": 0.14790509641170502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1609693467617035, + "epoch": 0.23, + "learning_rate": 4.8351648351648355e-05, + "loss": 0.2793, + "step": 273, + "task_loss": 0.21414852142333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21765820682048798, + "epoch": 0.23, + "learning_rate": 4.834561043352253e-05, + "loss": 0.2814, + "step": 274, + "task_loss": 0.38039907813072205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1675441861152649, + "epoch": 0.23, + "learning_rate": 4.833957251539669e-05, + "loss": 0.2665, + "step": 275, + "task_loss": 0.04698711261153221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4158642888069153, + "epoch": 0.23, + "learning_rate": 4.833353459727086e-05, + "loss": 0.4288, + "step": 276, + "task_loss": 0.3804851770401001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19586452841758728, + "epoch": 0.23, + "learning_rate": 4.832749667914504e-05, + "loss": 0.3039, + "step": 277, + "task_loss": 0.12271113693714142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4234282672405243, + "epoch": 0.23, + "learning_rate": 4.83214587610192e-05, + "loss": 0.3037, + "step": 278, + "task_loss": 0.3277617394924164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3377179503440857, + "epoch": 0.24, + "learning_rate": 4.831542084289337e-05, + "loss": 0.3565, + "step": 279, + "task_loss": 0.6190507411956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3592570126056671, + "epoch": 0.24, + "learning_rate": 4.8309382924767545e-05, + "loss": 0.3361, + "step": 280, + "task_loss": 0.5719615817070007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2319561243057251, + "epoch": 0.24, + "learning_rate": 4.830334500664171e-05, + "loss": 0.3792, + "step": 281, + "task_loss": 0.5975136756896973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36890077590942383, + "epoch": 0.24, + "learning_rate": 4.829730708851588e-05, + "loss": 0.3507, + "step": 282, + "task_loss": 0.7881508469581604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30688631534576416, + "epoch": 0.24, + "learning_rate": 4.8291269170390054e-05, + "loss": 0.4851, + "step": 283, + "task_loss": 1.1022228002548218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2913910448551178, + "epoch": 0.24, + "learning_rate": 4.828523125226422e-05, + "loss": 0.2951, + "step": 284, + "task_loss": 0.5692667365074158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3001357913017273, + "epoch": 0.24, + "learning_rate": 4.827919333413839e-05, + "loss": 0.316, + "step": 285, + "task_loss": 0.5103100538253784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19008879363536835, + "epoch": 0.24, + "learning_rate": 4.827315541601256e-05, + "loss": 0.332, + "step": 286, + "task_loss": 0.3047529458999634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3289864957332611, + "epoch": 0.24, + "learning_rate": 4.826711749788673e-05, + "loss": 0.3928, + "step": 287, + "task_loss": 0.48146453499794006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8252094388008118, + "epoch": 0.24, + "learning_rate": 4.8261079579760896e-05, + "loss": 0.4665, + "step": 288, + "task_loss": 0.42987194657325745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3161754608154297, + "epoch": 0.24, + "learning_rate": 4.825504166163507e-05, + "loss": 0.3388, + "step": 289, + "task_loss": 1.4513256549835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16933982074260712, + "epoch": 0.24, + "learning_rate": 4.8249003743509244e-05, + "loss": 0.2714, + "step": 290, + "task_loss": 0.8262923359870911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37988537549972534, + "epoch": 0.25, + "learning_rate": 4.824296582538341e-05, + "loss": 0.3737, + "step": 291, + "task_loss": 0.469099760055542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25396668910980225, + "epoch": 0.25, + "learning_rate": 4.823692790725758e-05, + "loss": 0.3128, + "step": 292, + "task_loss": 0.22486521303653717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4176521599292755, + "epoch": 0.25, + "learning_rate": 4.823088998913175e-05, + "loss": 0.3963, + "step": 293, + "task_loss": 0.48993009328842163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34238067269325256, + "epoch": 0.25, + "learning_rate": 4.822485207100592e-05, + "loss": 0.3931, + "step": 294, + "task_loss": 0.6426882743835449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19826723635196686, + "epoch": 0.25, + "learning_rate": 4.821881415288009e-05, + "loss": 0.3186, + "step": 295, + "task_loss": 0.23027630150318146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43875911831855774, + "epoch": 0.25, + "learning_rate": 4.821277623475426e-05, + "loss": 0.4149, + "step": 296, + "task_loss": 1.1842646598815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1716620773077011, + "epoch": 0.25, + "learning_rate": 4.820673831662843e-05, + "loss": 0.3365, + "step": 297, + "task_loss": 0.5584362745285034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4358714520931244, + "epoch": 0.25, + "learning_rate": 4.8200700398502595e-05, + "loss": 0.3862, + "step": 298, + "task_loss": 0.6126980185508728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4216727614402771, + "epoch": 0.25, + "learning_rate": 4.819466248037677e-05, + "loss": 0.3903, + "step": 299, + "task_loss": 0.5612708330154419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46830129623413086, + "epoch": 0.25, + "learning_rate": 4.8188624562250937e-05, + "loss": 0.424, + "step": 300, + "task_loss": 0.6126656532287598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3437400162220001, + "epoch": 0.25, + "learning_rate": 4.818258664412511e-05, + "loss": 0.3715, + "step": 301, + "task_loss": 0.2198963165283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3458477258682251, + "epoch": 0.26, + "learning_rate": 4.817654872599928e-05, + "loss": 0.3308, + "step": 302, + "task_loss": 0.6421058773994446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3061091899871826, + "epoch": 0.26, + "learning_rate": 4.8170510807873445e-05, + "loss": 0.3606, + "step": 303, + "task_loss": 0.48531925678253174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5407686233520508, + "epoch": 0.26, + "learning_rate": 4.816447288974762e-05, + "loss": 0.3952, + "step": 304, + "task_loss": 1.4983210563659668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5865715146064758, + "epoch": 0.26, + "learning_rate": 4.8158434971621786e-05, + "loss": 0.3531, + "step": 305, + "task_loss": 1.0585517883300781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15689745545387268, + "epoch": 0.26, + "learning_rate": 4.815239705349596e-05, + "loss": 0.319, + "step": 306, + "task_loss": 0.05530761554837227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3637600243091583, + "epoch": 0.26, + "learning_rate": 4.814635913537013e-05, + "loss": 0.2908, + "step": 307, + "task_loss": 1.0135647058486938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3493724465370178, + "epoch": 0.26, + "learning_rate": 4.8140321217244294e-05, + "loss": 0.3799, + "step": 308, + "task_loss": 0.9135453104972839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.398184597492218, + "epoch": 0.26, + "learning_rate": 4.813428329911847e-05, + "loss": 0.2983, + "step": 309, + "task_loss": 0.31708386540412903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30986547470092773, + "epoch": 0.26, + "learning_rate": 4.8128245380992635e-05, + "loss": 0.4664, + "step": 310, + "task_loss": 1.2336593866348267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20752136409282684, + "epoch": 0.26, + "learning_rate": 4.812220746286681e-05, + "loss": 0.3982, + "step": 311, + "task_loss": 0.19107981026172638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20283068716526031, + "epoch": 0.26, + "learning_rate": 4.811616954474098e-05, + "loss": 0.2838, + "step": 312, + "task_loss": 0.1679132580757141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3719440698623657, + "epoch": 0.26, + "learning_rate": 4.8110131626615144e-05, + "loss": 0.2413, + "step": 313, + "task_loss": 0.5856820344924927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24177727103233337, + "epoch": 0.27, + "learning_rate": 4.810409370848932e-05, + "loss": 0.3381, + "step": 314, + "task_loss": 0.473675400018692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30009961128234863, + "epoch": 0.27, + "learning_rate": 4.8098055790363485e-05, + "loss": 0.3153, + "step": 315, + "task_loss": 0.6971793174743652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18764623999595642, + "epoch": 0.27, + "learning_rate": 4.809201787223765e-05, + "loss": 0.3125, + "step": 316, + "task_loss": 0.21867595613002777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4035642743110657, + "epoch": 0.27, + "learning_rate": 4.8085979954111826e-05, + "loss": 0.4733, + "step": 317, + "task_loss": 1.023959755897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20800375938415527, + "epoch": 0.27, + "learning_rate": 4.807994203598599e-05, + "loss": 0.3641, + "step": 318, + "task_loss": 0.5931673645973206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3392481207847595, + "epoch": 0.27, + "learning_rate": 4.807390411786016e-05, + "loss": 0.384, + "step": 319, + "task_loss": 0.5908573269844055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2010572850704193, + "epoch": 0.27, + "learning_rate": 4.8067866199734334e-05, + "loss": 0.2327, + "step": 320, + "task_loss": 0.5935060977935791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20867067575454712, + "epoch": 0.27, + "learning_rate": 4.806182828160851e-05, + "loss": 0.4388, + "step": 321, + "task_loss": 1.138772964477539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21708956360816956, + "epoch": 0.27, + "learning_rate": 4.8055790363482676e-05, + "loss": 0.3001, + "step": 322, + "task_loss": 0.6124125123023987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4039044678211212, + "epoch": 0.27, + "learning_rate": 4.804975244535684e-05, + "loss": 0.3187, + "step": 323, + "task_loss": 1.301652431488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29664894938468933, + "epoch": 0.27, + "learning_rate": 4.804371452723102e-05, + "loss": 0.4175, + "step": 324, + "task_loss": 0.5054799914360046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25232619047164917, + "epoch": 0.27, + "learning_rate": 4.8037676609105184e-05, + "loss": 0.45, + "step": 325, + "task_loss": 0.25673484802246094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24607592821121216, + "epoch": 0.28, + "learning_rate": 4.803163869097935e-05, + "loss": 0.2717, + "step": 326, + "task_loss": 0.6230039596557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32535046339035034, + "epoch": 0.28, + "learning_rate": 4.8025600772853525e-05, + "loss": 0.3099, + "step": 327, + "task_loss": 0.92620849609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2036980390548706, + "epoch": 0.28, + "learning_rate": 4.801956285472769e-05, + "loss": 0.2848, + "step": 328, + "task_loss": 0.4006662368774414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30950790643692017, + "epoch": 0.28, + "learning_rate": 4.801352493660186e-05, + "loss": 0.4279, + "step": 329, + "task_loss": 0.5127992033958435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49542707204818726, + "epoch": 0.28, + "learning_rate": 4.800748701847603e-05, + "loss": 0.4268, + "step": 330, + "task_loss": 0.32095232605934143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2551426589488983, + "epoch": 0.28, + "learning_rate": 4.800144910035021e-05, + "loss": 0.3823, + "step": 331, + "task_loss": 0.404095321893692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3106461763381958, + "epoch": 0.28, + "learning_rate": 4.799541118222437e-05, + "loss": 0.2702, + "step": 332, + "task_loss": 0.36216187477111816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20378345251083374, + "epoch": 0.28, + "learning_rate": 4.798937326409854e-05, + "loss": 0.2835, + "step": 333, + "task_loss": 0.2527221441268921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24155229330062866, + "epoch": 0.28, + "learning_rate": 4.7983335345972716e-05, + "loss": 0.3078, + "step": 334, + "task_loss": 0.6740986108779907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2674604058265686, + "epoch": 0.28, + "learning_rate": 4.7977297427846876e-05, + "loss": 0.2188, + "step": 335, + "task_loss": 0.6719998121261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18219339847564697, + "epoch": 0.28, + "learning_rate": 4.797125950972105e-05, + "loss": 0.2772, + "step": 336, + "task_loss": 0.08983666449785233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2661128044128418, + "epoch": 0.28, + "learning_rate": 4.7965221591595224e-05, + "loss": 0.3587, + "step": 337, + "task_loss": 0.9328497648239136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4267786145210266, + "epoch": 0.29, + "learning_rate": 4.795918367346939e-05, + "loss": 0.4127, + "step": 338, + "task_loss": 0.7519228458404541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33033639192581177, + "epoch": 0.29, + "learning_rate": 4.795314575534356e-05, + "loss": 0.3252, + "step": 339, + "task_loss": 0.6892683506011963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23219440877437592, + "epoch": 0.29, + "learning_rate": 4.794710783721773e-05, + "loss": 0.3117, + "step": 340, + "task_loss": 0.8769976496696472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30014723539352417, + "epoch": 0.29, + "learning_rate": 4.79410699190919e-05, + "loss": 0.3359, + "step": 341, + "task_loss": 0.37657225131988525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22581475973129272, + "epoch": 0.29, + "learning_rate": 4.793503200096607e-05, + "loss": 0.2733, + "step": 342, + "task_loss": 0.2080300748348236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4027292728424072, + "epoch": 0.29, + "learning_rate": 4.792899408284024e-05, + "loss": 0.4094, + "step": 343, + "task_loss": 0.6712422370910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3460029363632202, + "epoch": 0.29, + "learning_rate": 4.792295616471441e-05, + "loss": 0.3638, + "step": 344, + "task_loss": 0.9514648914337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27742084860801697, + "epoch": 0.29, + "learning_rate": 4.7916918246588575e-05, + "loss": 0.3456, + "step": 345, + "task_loss": 1.0332428216934204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2522352337837219, + "epoch": 0.29, + "learning_rate": 4.791088032846275e-05, + "loss": 0.2361, + "step": 346, + "task_loss": 0.3564586639404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2361924797296524, + "epoch": 0.29, + "learning_rate": 4.790484241033692e-05, + "loss": 0.2914, + "step": 347, + "task_loss": 0.4369828999042511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24098509550094604, + "epoch": 0.29, + "learning_rate": 4.789880449221108e-05, + "loss": 0.3358, + "step": 348, + "task_loss": 0.41343551874160767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3081885874271393, + "epoch": 0.29, + "learning_rate": 4.789276657408526e-05, + "loss": 0.3309, + "step": 349, + "task_loss": 0.05100584030151367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31949707865715027, + "epoch": 0.3, + "learning_rate": 4.788672865595943e-05, + "loss": 0.4014, + "step": 350, + "task_loss": 0.18634140491485596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16327182948589325, + "epoch": 0.3, + "learning_rate": 4.78806907378336e-05, + "loss": 0.3068, + "step": 351, + "task_loss": 0.5471976399421692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3204161524772644, + "epoch": 0.3, + "learning_rate": 4.7874652819707766e-05, + "loss": 0.3572, + "step": 352, + "task_loss": 0.9759595990180969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3595246374607086, + "epoch": 0.3, + "learning_rate": 4.786861490158194e-05, + "loss": 0.4123, + "step": 353, + "task_loss": 0.3917081952095032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2728806734085083, + "epoch": 0.3, + "learning_rate": 4.786257698345611e-05, + "loss": 0.3699, + "step": 354, + "task_loss": 0.2402394860982895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2754240334033966, + "epoch": 0.3, + "learning_rate": 4.7856539065330274e-05, + "loss": 0.3278, + "step": 355, + "task_loss": 0.15423518419265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14029163122177124, + "epoch": 0.3, + "learning_rate": 4.785050114720445e-05, + "loss": 0.3902, + "step": 356, + "task_loss": 0.8633174300193787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4380922317504883, + "epoch": 0.3, + "learning_rate": 4.7844463229078615e-05, + "loss": 0.3097, + "step": 357, + "task_loss": 0.24112439155578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3287860155105591, + "epoch": 0.3, + "learning_rate": 4.783842531095278e-05, + "loss": 0.2758, + "step": 358, + "task_loss": 0.789581835269928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3519061803817749, + "epoch": 0.3, + "learning_rate": 4.7832387392826956e-05, + "loss": 0.3612, + "step": 359, + "task_loss": 0.6147396564483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6181222200393677, + "epoch": 0.3, + "learning_rate": 4.7826349474701123e-05, + "loss": 0.3759, + "step": 360, + "task_loss": 0.5621562004089355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39813950657844543, + "epoch": 0.3, + "learning_rate": 4.782031155657529e-05, + "loss": 0.4228, + "step": 361, + "task_loss": 1.0186234712600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4485042691230774, + "epoch": 0.31, + "learning_rate": 4.7814273638449465e-05, + "loss": 0.3477, + "step": 362, + "task_loss": 0.8557292819023132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12397646903991699, + "epoch": 0.31, + "learning_rate": 4.780823572032364e-05, + "loss": 0.3689, + "step": 363, + "task_loss": 0.6793264150619507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3271588087081909, + "epoch": 0.31, + "learning_rate": 4.7802197802197806e-05, + "loss": 0.4006, + "step": 364, + "task_loss": 1.4454721212387085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3347281217575073, + "epoch": 0.31, + "learning_rate": 4.779615988407197e-05, + "loss": 0.3006, + "step": 365, + "task_loss": 0.08478929102420807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.10874610394239426, + "epoch": 0.31, + "learning_rate": 4.779012196594615e-05, + "loss": 0.3392, + "step": 366, + "task_loss": 0.298480749130249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47120529413223267, + "epoch": 0.31, + "learning_rate": 4.7784084047820314e-05, + "loss": 0.4524, + "step": 367, + "task_loss": 0.9685383439064026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3274350166320801, + "epoch": 0.31, + "learning_rate": 4.777804612969448e-05, + "loss": 0.3685, + "step": 368, + "task_loss": 0.6706613302230835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3334848880767822, + "epoch": 0.31, + "learning_rate": 4.7772008211568655e-05, + "loss": 0.3914, + "step": 369, + "task_loss": 0.2539215385913849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5926135182380676, + "epoch": 0.31, + "learning_rate": 4.776597029344282e-05, + "loss": 0.4606, + "step": 370, + "task_loss": 0.6668665409088135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37934818863868713, + "epoch": 0.31, + "learning_rate": 4.775993237531699e-05, + "loss": 0.2906, + "step": 371, + "task_loss": 0.030765190720558167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17070582509040833, + "epoch": 0.31, + "learning_rate": 4.7753894457191163e-05, + "loss": 0.3119, + "step": 372, + "task_loss": 0.5617129802703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19154563546180725, + "epoch": 0.32, + "learning_rate": 4.774785653906533e-05, + "loss": 0.2885, + "step": 373, + "task_loss": 0.3278349041938782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12602773308753967, + "epoch": 0.32, + "learning_rate": 4.7741818620939505e-05, + "loss": 0.331, + "step": 374, + "task_loss": 0.08748903125524521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26439401507377625, + "epoch": 0.32, + "learning_rate": 4.773578070281367e-05, + "loss": 0.3115, + "step": 375, + "task_loss": 0.3877311646938324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38234806060791016, + "epoch": 0.32, + "learning_rate": 4.772974278468784e-05, + "loss": 0.342, + "step": 376, + "task_loss": 0.30214378237724304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2441001534461975, + "epoch": 0.32, + "learning_rate": 4.772370486656201e-05, + "loss": 0.345, + "step": 377, + "task_loss": 0.33088675141334534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34262794256210327, + "epoch": 0.32, + "learning_rate": 4.771766694843618e-05, + "loss": 0.2664, + "step": 378, + "task_loss": 0.3423719108104706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.372978150844574, + "epoch": 0.32, + "learning_rate": 4.7711629030310354e-05, + "loss": 0.3085, + "step": 379, + "task_loss": 0.1712331622838974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.264641672372818, + "epoch": 0.32, + "learning_rate": 4.770559111218452e-05, + "loss": 0.2919, + "step": 380, + "task_loss": 0.6030244827270508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42999929189682007, + "epoch": 0.32, + "learning_rate": 4.769955319405869e-05, + "loss": 0.3403, + "step": 381, + "task_loss": 0.6663162708282471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38923925161361694, + "epoch": 0.32, + "learning_rate": 4.769351527593286e-05, + "loss": 0.3191, + "step": 382, + "task_loss": 0.7168391346931458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42394357919692993, + "epoch": 0.32, + "learning_rate": 4.768747735780703e-05, + "loss": 0.3547, + "step": 383, + "task_loss": 1.1611169576644897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33820244669914246, + "epoch": 0.32, + "learning_rate": 4.7681439439681204e-05, + "loss": 0.3872, + "step": 384, + "task_loss": 0.31316763162612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34335267543792725, + "epoch": 0.33, + "learning_rate": 4.767540152155537e-05, + "loss": 0.4133, + "step": 385, + "task_loss": 0.7624664306640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3195539116859436, + "epoch": 0.33, + "learning_rate": 4.766936360342954e-05, + "loss": 0.3734, + "step": 386, + "task_loss": 0.3183731436729431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3304663896560669, + "epoch": 0.33, + "learning_rate": 4.766332568530371e-05, + "loss": 0.4013, + "step": 387, + "task_loss": 0.44702717661857605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4347894787788391, + "epoch": 0.33, + "learning_rate": 4.765728776717788e-05, + "loss": 0.4846, + "step": 388, + "task_loss": 0.7946614027023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3409498631954193, + "epoch": 0.33, + "learning_rate": 4.7651249849052046e-05, + "loss": 0.4137, + "step": 389, + "task_loss": 0.9257267117500305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4376046657562256, + "epoch": 0.33, + "learning_rate": 4.764521193092622e-05, + "loss": 0.3217, + "step": 390, + "task_loss": 0.75620436668396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47691571712493896, + "epoch": 0.33, + "learning_rate": 4.763917401280039e-05, + "loss": 0.2519, + "step": 391, + "task_loss": 0.6885778903961182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3625657260417938, + "epoch": 0.33, + "learning_rate": 4.7633136094674555e-05, + "loss": 0.5033, + "step": 392, + "task_loss": 0.3627155125141144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26669323444366455, + "epoch": 0.33, + "learning_rate": 4.762709817654873e-05, + "loss": 0.4066, + "step": 393, + "task_loss": 1.4566916227340698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30266720056533813, + "epoch": 0.33, + "learning_rate": 4.76210602584229e-05, + "loss": 0.3726, + "step": 394, + "task_loss": 0.5834892392158508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21502426266670227, + "epoch": 0.33, + "learning_rate": 4.761502234029707e-05, + "loss": 0.2828, + "step": 395, + "task_loss": 0.6627023220062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3386573791503906, + "epoch": 0.33, + "learning_rate": 4.760898442217124e-05, + "loss": 0.3402, + "step": 396, + "task_loss": 0.32269924879074097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3740113377571106, + "epoch": 0.34, + "learning_rate": 4.760294650404541e-05, + "loss": 0.3358, + "step": 397, + "task_loss": 0.4154449701309204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4196130037307739, + "epoch": 0.34, + "learning_rate": 4.759690858591958e-05, + "loss": 0.4343, + "step": 398, + "task_loss": 0.5105347633361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2441515177488327, + "epoch": 0.34, + "learning_rate": 4.7590870667793745e-05, + "loss": 0.358, + "step": 399, + "task_loss": 0.4545292854309082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3539087772369385, + "epoch": 0.34, + "learning_rate": 4.758483274966792e-05, + "loss": 0.2667, + "step": 400, + "task_loss": 0.9259809851646423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47437068819999695, + "epoch": 0.34, + "learning_rate": 4.7578794831542086e-05, + "loss": 0.4189, + "step": 401, + "task_loss": 1.0032349824905396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5229030847549438, + "epoch": 0.34, + "learning_rate": 4.7572756913416254e-05, + "loss": 0.2971, + "step": 402, + "task_loss": 0.8462005853652954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3757791817188263, + "epoch": 0.34, + "learning_rate": 4.756671899529043e-05, + "loss": 0.2793, + "step": 403, + "task_loss": 0.1159130185842514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3542606830596924, + "epoch": 0.34, + "learning_rate": 4.75606810771646e-05, + "loss": 0.3693, + "step": 404, + "task_loss": 0.8220679759979248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4766911566257477, + "epoch": 0.34, + "learning_rate": 4.755464315903876e-05, + "loss": 0.3797, + "step": 405, + "task_loss": 0.518310010433197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18795335292816162, + "epoch": 0.34, + "learning_rate": 4.7548605240912936e-05, + "loss": 0.3843, + "step": 406, + "task_loss": 0.39841240644454956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3532783091068268, + "epoch": 0.34, + "learning_rate": 4.754256732278711e-05, + "loss": 0.4428, + "step": 407, + "task_loss": 0.385682076215744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37540656328201294, + "epoch": 0.34, + "learning_rate": 4.753652940466127e-05, + "loss": 0.3145, + "step": 408, + "task_loss": 0.7418531179428101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25664040446281433, + "epoch": 0.35, + "learning_rate": 4.7530491486535444e-05, + "loss": 0.3971, + "step": 409, + "task_loss": 0.14077523350715637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24709947407245636, + "epoch": 0.35, + "learning_rate": 4.752445356840962e-05, + "loss": 0.3414, + "step": 410, + "task_loss": 0.06836795061826706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44139283895492554, + "epoch": 0.35, + "learning_rate": 4.751841565028378e-05, + "loss": 0.3348, + "step": 411, + "task_loss": 0.5904854536056519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18676511943340302, + "epoch": 0.35, + "learning_rate": 4.751237773215795e-05, + "loss": 0.3647, + "step": 412, + "task_loss": 0.00229077716358006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20302972197532654, + "epoch": 0.35, + "learning_rate": 4.7506339814032126e-05, + "loss": 0.2833, + "step": 413, + "task_loss": 0.6538112163543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1949908286333084, + "epoch": 0.35, + "learning_rate": 4.7500301895906294e-05, + "loss": 0.4827, + "step": 414, + "task_loss": 1.1799983978271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.643801212310791, + "epoch": 0.35, + "learning_rate": 4.749426397778046e-05, + "loss": 0.5334, + "step": 415, + "task_loss": 0.4353008568286896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46924811601638794, + "epoch": 0.35, + "learning_rate": 4.7488226059654635e-05, + "loss": 0.3695, + "step": 416, + "task_loss": 0.025507470592856407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.403201162815094, + "epoch": 0.35, + "learning_rate": 4.74821881415288e-05, + "loss": 0.4249, + "step": 417, + "task_loss": 0.214700847864151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30432790517807007, + "epoch": 0.35, + "learning_rate": 4.747615022340297e-05, + "loss": 0.3563, + "step": 418, + "task_loss": 0.5774965286254883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35530805587768555, + "epoch": 0.35, + "learning_rate": 4.747011230527714e-05, + "loss": 0.333, + "step": 419, + "task_loss": 0.6179263591766357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26833927631378174, + "epoch": 0.35, + "learning_rate": 4.746407438715132e-05, + "loss": 0.4482, + "step": 420, + "task_loss": 0.7877441644668579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46565455198287964, + "epoch": 0.36, + "learning_rate": 4.745803646902548e-05, + "loss": 0.401, + "step": 421, + "task_loss": 0.6897768378257751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15110769867897034, + "epoch": 0.36, + "learning_rate": 4.745199855089965e-05, + "loss": 0.2966, + "step": 422, + "task_loss": 0.06402797996997833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3630506098270416, + "epoch": 0.36, + "learning_rate": 4.7445960632773825e-05, + "loss": 0.3753, + "step": 423, + "task_loss": 0.6175591945648193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32583174109458923, + "epoch": 0.36, + "learning_rate": 4.743992271464799e-05, + "loss": 0.341, + "step": 424, + "task_loss": 0.8466601371765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2001602053642273, + "epoch": 0.36, + "learning_rate": 4.743388479652216e-05, + "loss": 0.2579, + "step": 425, + "task_loss": 0.07614468038082123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33632713556289673, + "epoch": 0.36, + "learning_rate": 4.7427846878396334e-05, + "loss": 0.4204, + "step": 426, + "task_loss": 1.6080766916275024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27337777614593506, + "epoch": 0.36, + "learning_rate": 4.74218089602705e-05, + "loss": 0.3174, + "step": 427, + "task_loss": 1.8618650436401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30001145601272583, + "epoch": 0.36, + "learning_rate": 4.741577104214467e-05, + "loss": 0.405, + "step": 428, + "task_loss": 1.3293850421905518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19100527465343475, + "epoch": 0.36, + "learning_rate": 4.740973312401884e-05, + "loss": 0.3861, + "step": 429, + "task_loss": 0.8412354588508606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1726417988538742, + "epoch": 0.36, + "learning_rate": 4.740369520589301e-05, + "loss": 0.3602, + "step": 430, + "task_loss": 1.2546541690826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27611637115478516, + "epoch": 0.36, + "learning_rate": 4.7397657287767176e-05, + "loss": 0.4678, + "step": 431, + "task_loss": 0.7354687452316284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30049943923950195, + "epoch": 0.36, + "learning_rate": 4.739161936964135e-05, + "loss": 0.3634, + "step": 432, + "task_loss": 0.47865769267082214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40666812658309937, + "epoch": 0.37, + "learning_rate": 4.738558145151552e-05, + "loss": 0.3142, + "step": 433, + "task_loss": 0.4740898907184601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28262925148010254, + "epoch": 0.37, + "learning_rate": 4.737954353338969e-05, + "loss": 0.3382, + "step": 434, + "task_loss": 0.7306826114654541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2715093195438385, + "epoch": 0.37, + "learning_rate": 4.737350561526386e-05, + "loss": 0.2396, + "step": 435, + "task_loss": 0.12085134536027908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.495231032371521, + "epoch": 0.37, + "learning_rate": 4.736746769713803e-05, + "loss": 0.3749, + "step": 436, + "task_loss": 0.5271447896957397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2588498592376709, + "epoch": 0.37, + "learning_rate": 4.73614297790122e-05, + "loss": 0.3323, + "step": 437, + "task_loss": 1.2773017883300781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3426035940647125, + "epoch": 0.37, + "learning_rate": 4.735539186088637e-05, + "loss": 0.3988, + "step": 438, + "task_loss": 1.2968825101852417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3027150630950928, + "epoch": 0.37, + "learning_rate": 4.734935394276054e-05, + "loss": 0.3351, + "step": 439, + "task_loss": 0.8464741110801697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28821468353271484, + "epoch": 0.37, + "learning_rate": 4.734331602463471e-05, + "loss": 0.4304, + "step": 440, + "task_loss": 0.5745480060577393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22773291170597076, + "epoch": 0.37, + "learning_rate": 4.7337278106508875e-05, + "loss": 0.3224, + "step": 441, + "task_loss": 0.12265873700380325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2685242295265198, + "epoch": 0.37, + "learning_rate": 4.733124018838305e-05, + "loss": 0.2533, + "step": 442, + "task_loss": 0.9167185425758362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3029676675796509, + "epoch": 0.37, + "learning_rate": 4.7325202270257216e-05, + "loss": 0.3903, + "step": 443, + "task_loss": 0.4724733829498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34784698486328125, + "epoch": 0.38, + "learning_rate": 4.731916435213139e-05, + "loss": 0.4061, + "step": 444, + "task_loss": 0.4826476573944092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4083378314971924, + "epoch": 0.38, + "learning_rate": 4.731312643400556e-05, + "loss": 0.4403, + "step": 445, + "task_loss": 0.7389659285545349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19734838604927063, + "epoch": 0.38, + "learning_rate": 4.7307088515879725e-05, + "loss": 0.2514, + "step": 446, + "task_loss": 0.36828845739364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2308603823184967, + "epoch": 0.38, + "learning_rate": 4.73010505977539e-05, + "loss": 0.3262, + "step": 447, + "task_loss": 0.8120452165603638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22636263072490692, + "epoch": 0.38, + "learning_rate": 4.7295012679628066e-05, + "loss": 0.5631, + "step": 448, + "task_loss": 0.8329577445983887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.435597687959671, + "epoch": 0.38, + "learning_rate": 4.728897476150223e-05, + "loss": 0.4089, + "step": 449, + "task_loss": 1.5263049602508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20194268226623535, + "epoch": 0.38, + "learning_rate": 4.728293684337641e-05, + "loss": 0.3599, + "step": 450, + "task_loss": 0.2185748815536499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18315492570400238, + "epoch": 0.38, + "learning_rate": 4.7276898925250574e-05, + "loss": 0.2685, + "step": 451, + "task_loss": 0.3106297552585602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1951131373643875, + "epoch": 0.38, + "learning_rate": 4.727086100712475e-05, + "loss": 0.2201, + "step": 452, + "task_loss": 0.11870678514242172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19492274522781372, + "epoch": 0.38, + "learning_rate": 4.7264823088998915e-05, + "loss": 0.3888, + "step": 453, + "task_loss": 0.22582919895648956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2240012139081955, + "epoch": 0.38, + "learning_rate": 4.725878517087309e-05, + "loss": 0.3901, + "step": 454, + "task_loss": 0.5290884375572205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18829910457134247, + "epoch": 0.38, + "learning_rate": 4.7252747252747257e-05, + "loss": 0.2296, + "step": 455, + "task_loss": 0.753002941608429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2122923731803894, + "epoch": 0.39, + "learning_rate": 4.7246709334621424e-05, + "loss": 0.307, + "step": 456, + "task_loss": 0.0786176398396492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30417200922966003, + "epoch": 0.39, + "learning_rate": 4.72406714164956e-05, + "loss": 0.2403, + "step": 457, + "task_loss": 0.3950548768043518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14613334834575653, + "epoch": 0.39, + "learning_rate": 4.7234633498369765e-05, + "loss": 0.4295, + "step": 458, + "task_loss": 0.9901432991027832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19673016667366028, + "epoch": 0.39, + "learning_rate": 4.722859558024393e-05, + "loss": 0.3662, + "step": 459, + "task_loss": 0.9414348006248474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2953929305076599, + "epoch": 0.39, + "learning_rate": 4.7222557662118106e-05, + "loss": 0.4012, + "step": 460, + "task_loss": 0.8229372501373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2553478181362152, + "epoch": 0.39, + "learning_rate": 4.721651974399227e-05, + "loss": 0.3498, + "step": 461, + "task_loss": 1.105999231338501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30003100633621216, + "epoch": 0.39, + "learning_rate": 4.721048182586644e-05, + "loss": 0.3241, + "step": 462, + "task_loss": 0.20932143926620483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29439473152160645, + "epoch": 0.39, + "learning_rate": 4.7204443907740614e-05, + "loss": 0.3477, + "step": 463, + "task_loss": 0.3300442397594452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6319240927696228, + "epoch": 0.39, + "learning_rate": 4.719840598961479e-05, + "loss": 0.3711, + "step": 464, + "task_loss": 0.45971715450286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37110820412635803, + "epoch": 0.39, + "learning_rate": 4.719236807148895e-05, + "loss": 0.3154, + "step": 465, + "task_loss": 0.47261321544647217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3097112774848938, + "epoch": 0.39, + "learning_rate": 4.718633015336312e-05, + "loss": 0.2648, + "step": 466, + "task_loss": 0.5097870230674744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36473336815834045, + "epoch": 0.39, + "learning_rate": 4.7180292235237297e-05, + "loss": 0.2702, + "step": 467, + "task_loss": 0.4184640049934387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46796947717666626, + "epoch": 0.4, + "learning_rate": 4.7174254317111464e-05, + "loss": 0.3551, + "step": 468, + "task_loss": 0.5642027258872986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16709420084953308, + "epoch": 0.4, + "learning_rate": 4.716821639898563e-05, + "loss": 0.2777, + "step": 469, + "task_loss": 0.03220021724700928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5440744161605835, + "epoch": 0.4, + "learning_rate": 4.7162178480859805e-05, + "loss": 0.4067, + "step": 470, + "task_loss": 0.8898472189903259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37732553482055664, + "epoch": 0.4, + "learning_rate": 4.715614056273397e-05, + "loss": 0.3477, + "step": 471, + "task_loss": 0.9021671414375305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3206155598163605, + "epoch": 0.4, + "learning_rate": 4.715010264460814e-05, + "loss": 0.3562, + "step": 472, + "task_loss": 0.9387491345405579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34356531500816345, + "epoch": 0.4, + "learning_rate": 4.714406472648231e-05, + "loss": 0.3109, + "step": 473, + "task_loss": 0.5287536978721619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3042271137237549, + "epoch": 0.4, + "learning_rate": 4.713802680835648e-05, + "loss": 0.2381, + "step": 474, + "task_loss": 0.3828946650028229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44171178340911865, + "epoch": 0.4, + "learning_rate": 4.713198889023065e-05, + "loss": 0.3581, + "step": 475, + "task_loss": 0.5084564089775085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21827548742294312, + "epoch": 0.4, + "learning_rate": 4.712595097210482e-05, + "loss": 0.3171, + "step": 476, + "task_loss": 0.6665694713592529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22156867384910583, + "epoch": 0.4, + "learning_rate": 4.7119913053978996e-05, + "loss": 0.3437, + "step": 477, + "task_loss": 0.12441891431808472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4872015714645386, + "epoch": 0.4, + "learning_rate": 4.7113875135853156e-05, + "loss": 0.432, + "step": 478, + "task_loss": 1.1194671392440796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26371294260025024, + "epoch": 0.4, + "learning_rate": 4.710783721772733e-05, + "loss": 0.4232, + "step": 479, + "task_loss": 1.1028268337249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.283963680267334, + "epoch": 0.41, + "learning_rate": 4.7101799299601504e-05, + "loss": 0.3808, + "step": 480, + "task_loss": 0.8595671653747559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4099663496017456, + "epoch": 0.41, + "learning_rate": 4.7095761381475664e-05, + "loss": 0.3359, + "step": 481, + "task_loss": 1.1608103513717651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3045732080936432, + "epoch": 0.41, + "learning_rate": 4.708972346334984e-05, + "loss": 0.3399, + "step": 482, + "task_loss": 0.3181804418563843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34263813495635986, + "epoch": 0.41, + "learning_rate": 4.708368554522401e-05, + "loss": 0.32, + "step": 483, + "task_loss": 0.6213028430938721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2232375591993332, + "epoch": 0.41, + "learning_rate": 4.707764762709818e-05, + "loss": 0.2959, + "step": 484, + "task_loss": 0.37370654940605164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2694361209869385, + "epoch": 0.41, + "learning_rate": 4.7071609708972347e-05, + "loss": 0.3881, + "step": 485, + "task_loss": 1.0189025402069092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2621615529060364, + "epoch": 0.41, + "learning_rate": 4.706557179084652e-05, + "loss": 0.378, + "step": 486, + "task_loss": 0.8326951265335083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19903703033924103, + "epoch": 0.41, + "learning_rate": 4.705953387272069e-05, + "loss": 0.2651, + "step": 487, + "task_loss": 0.6074106693267822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2726070284843445, + "epoch": 0.41, + "learning_rate": 4.7053495954594855e-05, + "loss": 0.2451, + "step": 488, + "task_loss": 0.2530330419540405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27715301513671875, + "epoch": 0.41, + "learning_rate": 4.704745803646903e-05, + "loss": 0.3208, + "step": 489, + "task_loss": 0.4281701445579529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38308075070381165, + "epoch": 0.41, + "learning_rate": 4.7041420118343196e-05, + "loss": 0.4538, + "step": 490, + "task_loss": 0.2543697953224182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3218812346458435, + "epoch": 0.41, + "learning_rate": 4.703538220021736e-05, + "loss": 0.3782, + "step": 491, + "task_loss": 0.6149786710739136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5539299249649048, + "epoch": 0.42, + "learning_rate": 4.702934428209154e-05, + "loss": 0.4913, + "step": 492, + "task_loss": 0.5124923586845398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5299365520477295, + "epoch": 0.42, + "learning_rate": 4.702330636396571e-05, + "loss": 0.3712, + "step": 493, + "task_loss": 0.6752091646194458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34560248255729675, + "epoch": 0.42, + "learning_rate": 4.701726844583988e-05, + "loss": 0.4371, + "step": 494, + "task_loss": 0.14114762842655182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33097970485687256, + "epoch": 0.42, + "learning_rate": 4.7011230527714045e-05, + "loss": 0.4242, + "step": 495, + "task_loss": 0.2937622666358948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36671313643455505, + "epoch": 0.42, + "learning_rate": 4.700519260958822e-05, + "loss": 0.3409, + "step": 496, + "task_loss": 0.9777697324752808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5165556073188782, + "epoch": 0.42, + "learning_rate": 4.6999154691462387e-05, + "loss": 0.3776, + "step": 497, + "task_loss": 0.45647016167640686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44569462537765503, + "epoch": 0.42, + "learning_rate": 4.6993116773336554e-05, + "loss": 0.4087, + "step": 498, + "task_loss": 0.5437862873077393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23247092962265015, + "epoch": 0.42, + "learning_rate": 4.698707885521073e-05, + "loss": 0.2865, + "step": 499, + "task_loss": 0.5779015421867371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36501020193099976, + "epoch": 0.42, + "learning_rate": 4.6981040937084895e-05, + "loss": 0.2977, + "step": 500, + "task_loss": 0.02872415818274021 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.9111683168316832, + "eval_loss": 0.19494682550430298, + "eval_runtime": 338.708, + "eval_samples_per_second": 74.548, + "eval_steps_per_second": 0.585, + "step": 500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3552286624908447, + "epoch": 0.42, + "learning_rate": 4.697500301895906e-05, + "loss": 0.3759, + "step": 501, + "task_loss": 1.1596523523330688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29694682359695435, + "epoch": 0.42, + "learning_rate": 4.6968965100833236e-05, + "loss": 0.341, + "step": 502, + "task_loss": 0.24640515446662903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2870936989784241, + "epoch": 0.42, + "learning_rate": 4.69629271827074e-05, + "loss": 0.3098, + "step": 503, + "task_loss": 0.6115170121192932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26924842596054077, + "epoch": 0.43, + "learning_rate": 4.695688926458158e-05, + "loss": 0.4013, + "step": 504, + "task_loss": 0.8527316451072693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3132873475551605, + "epoch": 0.43, + "learning_rate": 4.6950851346455744e-05, + "loss": 0.2732, + "step": 505, + "task_loss": 0.551632285118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20017537474632263, + "epoch": 0.43, + "learning_rate": 4.694481342832991e-05, + "loss": 0.2497, + "step": 506, + "task_loss": 0.4739929139614105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4083116352558136, + "epoch": 0.43, + "learning_rate": 4.6938775510204086e-05, + "loss": 0.2903, + "step": 507, + "task_loss": 0.42030954360961914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12615306675434113, + "epoch": 0.43, + "learning_rate": 4.693273759207825e-05, + "loss": 0.2107, + "step": 508, + "task_loss": 0.1609063297510147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29174333810806274, + "epoch": 0.43, + "learning_rate": 4.692669967395243e-05, + "loss": 0.3058, + "step": 509, + "task_loss": 0.7792858481407166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2843620181083679, + "epoch": 0.43, + "learning_rate": 4.6920661755826594e-05, + "loss": 0.3742, + "step": 510, + "task_loss": 0.4710130989551544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30938857793807983, + "epoch": 0.43, + "learning_rate": 4.691462383770076e-05, + "loss": 0.4092, + "step": 511, + "task_loss": 0.7510309219360352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23981353640556335, + "epoch": 0.43, + "learning_rate": 4.6908585919574935e-05, + "loss": 0.4148, + "step": 512, + "task_loss": 0.6449210047721863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4427306056022644, + "epoch": 0.43, + "learning_rate": 4.69025480014491e-05, + "loss": 0.3666, + "step": 513, + "task_loss": 0.3347684442996979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21351030468940735, + "epoch": 0.43, + "learning_rate": 4.6896510083323276e-05, + "loss": 0.3193, + "step": 514, + "task_loss": 1.0636972188949585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2566937804222107, + "epoch": 0.44, + "learning_rate": 4.689047216519744e-05, + "loss": 0.3709, + "step": 515, + "task_loss": 0.7252180576324463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35753583908081055, + "epoch": 0.44, + "learning_rate": 4.688443424707161e-05, + "loss": 0.3494, + "step": 516, + "task_loss": 0.5258875489234924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5033993721008301, + "epoch": 0.44, + "learning_rate": 4.6878396328945784e-05, + "loss": 0.3532, + "step": 517, + "task_loss": 1.105150818824768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17686602473258972, + "epoch": 0.44, + "learning_rate": 4.687235841081995e-05, + "loss": 0.2566, + "step": 518, + "task_loss": 0.40853849053382874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32396629452705383, + "epoch": 0.44, + "learning_rate": 4.686632049269412e-05, + "loss": 0.3318, + "step": 519, + "task_loss": 1.226879358291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4283577501773834, + "epoch": 0.44, + "learning_rate": 4.686028257456829e-05, + "loss": 0.2728, + "step": 520, + "task_loss": 0.18439847230911255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5258373022079468, + "epoch": 0.44, + "learning_rate": 4.685424465644246e-05, + "loss": 0.4484, + "step": 521, + "task_loss": 0.3932799994945526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2439127266407013, + "epoch": 0.44, + "learning_rate": 4.684820673831663e-05, + "loss": 0.3316, + "step": 522, + "task_loss": 0.43680280447006226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2758333086967468, + "epoch": 0.44, + "learning_rate": 4.68421688201908e-05, + "loss": 0.2995, + "step": 523, + "task_loss": 0.6799713969230652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.575778067111969, + "epoch": 0.44, + "learning_rate": 4.6836130902064975e-05, + "loss": 0.4034, + "step": 524, + "task_loss": 0.8700759410858154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23655471205711365, + "epoch": 0.44, + "learning_rate": 4.683009298393914e-05, + "loss": 0.3195, + "step": 525, + "task_loss": 0.4206683039665222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18472610414028168, + "epoch": 0.44, + "learning_rate": 4.682405506581331e-05, + "loss": 0.3069, + "step": 526, + "task_loss": 0.5916554927825928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19131658971309662, + "epoch": 0.45, + "learning_rate": 4.6818017147687483e-05, + "loss": 0.298, + "step": 527, + "task_loss": 0.8907945156097412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3965601325035095, + "epoch": 0.45, + "learning_rate": 4.681197922956165e-05, + "loss": 0.4328, + "step": 528, + "task_loss": 0.2660832405090332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24752244353294373, + "epoch": 0.45, + "learning_rate": 4.680594131143582e-05, + "loss": 0.2761, + "step": 529, + "task_loss": 0.1341249942779541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11898336559534073, + "epoch": 0.45, + "learning_rate": 4.679990339330999e-05, + "loss": 0.3044, + "step": 530, + "task_loss": 0.5121662616729736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45424357056617737, + "epoch": 0.45, + "learning_rate": 4.679386547518416e-05, + "loss": 0.3614, + "step": 531, + "task_loss": 0.2839154601097107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3627973198890686, + "epoch": 0.45, + "learning_rate": 4.6787827557058326e-05, + "loss": 0.4045, + "step": 532, + "task_loss": 0.6190841197967529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5654903054237366, + "epoch": 0.45, + "learning_rate": 4.67817896389325e-05, + "loss": 0.412, + "step": 533, + "task_loss": 0.34364384412765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27257686853408813, + "epoch": 0.45, + "learning_rate": 4.677575172080667e-05, + "loss": 0.3734, + "step": 534, + "task_loss": 1.0794323682785034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20569965243339539, + "epoch": 0.45, + "learning_rate": 4.6769713802680834e-05, + "loss": 0.2292, + "step": 535, + "task_loss": 0.13309672474861145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2676234543323517, + "epoch": 0.45, + "learning_rate": 4.676367588455501e-05, + "loss": 0.3371, + "step": 536, + "task_loss": 0.4787544310092926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3429715037345886, + "epoch": 0.45, + "learning_rate": 4.675763796642918e-05, + "loss": 0.4323, + "step": 537, + "task_loss": 1.0065019130706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2396705150604248, + "epoch": 0.45, + "learning_rate": 4.675160004830334e-05, + "loss": 0.2787, + "step": 538, + "task_loss": 0.6923638582229614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40023157000541687, + "epoch": 0.46, + "learning_rate": 4.674556213017752e-05, + "loss": 0.3016, + "step": 539, + "task_loss": 0.7968195676803589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2712099850177765, + "epoch": 0.46, + "learning_rate": 4.673952421205169e-05, + "loss": 0.3879, + "step": 540, + "task_loss": 0.409343421459198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31481483578681946, + "epoch": 0.46, + "learning_rate": 4.673348629392585e-05, + "loss": 0.3544, + "step": 541, + "task_loss": 0.6697490215301514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2298501580953598, + "epoch": 0.46, + "learning_rate": 4.6727448375800025e-05, + "loss": 0.2573, + "step": 542, + "task_loss": 0.6586454510688782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3335941731929779, + "epoch": 0.46, + "learning_rate": 4.67214104576742e-05, + "loss": 0.2974, + "step": 543, + "task_loss": 1.0023521184921265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46105095744132996, + "epoch": 0.46, + "learning_rate": 4.6715372539548366e-05, + "loss": 0.363, + "step": 544, + "task_loss": 0.6028873920440674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3821551501750946, + "epoch": 0.46, + "learning_rate": 4.670933462142253e-05, + "loss": 0.3881, + "step": 545, + "task_loss": 0.9498618245124817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1347273886203766, + "epoch": 0.46, + "learning_rate": 4.670329670329671e-05, + "loss": 0.2558, + "step": 546, + "task_loss": 0.15808208286762238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27748334407806396, + "epoch": 0.46, + "learning_rate": 4.6697258785170875e-05, + "loss": 0.331, + "step": 547, + "task_loss": 0.5437588691711426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31439855694770813, + "epoch": 0.46, + "learning_rate": 4.669122086704504e-05, + "loss": 0.2868, + "step": 548, + "task_loss": 0.6029009222984314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36170822381973267, + "epoch": 0.46, + "learning_rate": 4.6685182948919216e-05, + "loss": 0.4417, + "step": 549, + "task_loss": 1.6580506563186646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23689739406108856, + "epoch": 0.46, + "learning_rate": 4.667914503079339e-05, + "loss": 0.3143, + "step": 550, + "task_loss": 0.8231652975082397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19411659240722656, + "epoch": 0.47, + "learning_rate": 4.667310711266755e-05, + "loss": 0.2952, + "step": 551, + "task_loss": 0.8502408862113953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17974667251110077, + "epoch": 0.47, + "learning_rate": 4.6667069194541724e-05, + "loss": 0.3763, + "step": 552, + "task_loss": 1.0208184719085693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15332043170928955, + "epoch": 0.47, + "learning_rate": 4.66610312764159e-05, + "loss": 0.3972, + "step": 553, + "task_loss": 1.0831626653671265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16559112071990967, + "epoch": 0.47, + "learning_rate": 4.665499335829006e-05, + "loss": 0.3241, + "step": 554, + "task_loss": 0.13271448016166687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2002086192369461, + "epoch": 0.47, + "learning_rate": 4.664895544016423e-05, + "loss": 0.3304, + "step": 555, + "task_loss": 0.3389005661010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2827780246734619, + "epoch": 0.47, + "learning_rate": 4.6642917522038406e-05, + "loss": 0.2844, + "step": 556, + "task_loss": 0.3220231235027313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35322192311286926, + "epoch": 0.47, + "learning_rate": 4.6636879603912573e-05, + "loss": 0.3659, + "step": 557, + "task_loss": 1.4571306705474854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1761866956949234, + "epoch": 0.47, + "learning_rate": 4.663084168578674e-05, + "loss": 0.2108, + "step": 558, + "task_loss": 0.1433708518743515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4945336580276489, + "epoch": 0.47, + "learning_rate": 4.6624803767660915e-05, + "loss": 0.3537, + "step": 559, + "task_loss": 0.603844165802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3818495571613312, + "epoch": 0.47, + "learning_rate": 4.661876584953508e-05, + "loss": 0.3609, + "step": 560, + "task_loss": 0.8040283918380737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4740597903728485, + "epoch": 0.47, + "learning_rate": 4.661272793140925e-05, + "loss": 0.3636, + "step": 561, + "task_loss": 0.45497509837150574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46093040704727173, + "epoch": 0.47, + "learning_rate": 4.660669001328342e-05, + "loss": 0.4507, + "step": 562, + "task_loss": 0.8263192176818848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40767866373062134, + "epoch": 0.48, + "learning_rate": 4.660065209515759e-05, + "loss": 0.3164, + "step": 563, + "task_loss": 0.8162437081336975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33189451694488525, + "epoch": 0.48, + "learning_rate": 4.659461417703176e-05, + "loss": 0.294, + "step": 564, + "task_loss": 1.0901103019714355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36081790924072266, + "epoch": 0.48, + "learning_rate": 4.658857625890593e-05, + "loss": 0.323, + "step": 565, + "task_loss": 0.41519632935523987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2105311155319214, + "epoch": 0.48, + "learning_rate": 4.6582538340780105e-05, + "loss": 0.2567, + "step": 566, + "task_loss": 0.1116391196846962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24808262288570404, + "epoch": 0.48, + "learning_rate": 4.657650042265427e-05, + "loss": 0.4331, + "step": 567, + "task_loss": 1.3634008169174194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15797454118728638, + "epoch": 0.48, + "learning_rate": 4.657046250452844e-05, + "loss": 0.4287, + "step": 568, + "task_loss": 0.07788126170635223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.536435604095459, + "epoch": 0.48, + "learning_rate": 4.6564424586402614e-05, + "loss": 0.4089, + "step": 569, + "task_loss": 0.877366840839386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.454207181930542, + "epoch": 0.48, + "learning_rate": 4.655838666827678e-05, + "loss": 0.3116, + "step": 570, + "task_loss": 0.3642159104347229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20970717072486877, + "epoch": 0.48, + "learning_rate": 4.655234875015095e-05, + "loss": 0.3425, + "step": 571, + "task_loss": 0.6273272633552551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30841565132141113, + "epoch": 0.48, + "learning_rate": 4.654631083202512e-05, + "loss": 0.2941, + "step": 572, + "task_loss": 0.1949205994606018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43708956241607666, + "epoch": 0.48, + "learning_rate": 4.654027291389929e-05, + "loss": 0.3344, + "step": 573, + "task_loss": 0.8806980848312378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21233753859996796, + "epoch": 0.48, + "learning_rate": 4.6534234995773456e-05, + "loss": 0.3196, + "step": 574, + "task_loss": 0.7414767146110535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2978823184967041, + "epoch": 0.49, + "learning_rate": 4.652819707764763e-05, + "loss": 0.4029, + "step": 575, + "task_loss": 0.48946142196655273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4036892056465149, + "epoch": 0.49, + "learning_rate": 4.65221591595218e-05, + "loss": 0.3856, + "step": 576, + "task_loss": 0.436568945646286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2572465240955353, + "epoch": 0.49, + "learning_rate": 4.651612124139597e-05, + "loss": 0.393, + "step": 577, + "task_loss": 1.0481715202331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3467211127281189, + "epoch": 0.49, + "learning_rate": 4.651008332327014e-05, + "loss": 0.372, + "step": 578, + "task_loss": 0.842318058013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30919787287712097, + "epoch": 0.49, + "learning_rate": 4.6504045405144306e-05, + "loss": 0.4773, + "step": 579, + "task_loss": 1.1685787439346313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2926180064678192, + "epoch": 0.49, + "learning_rate": 4.649800748701848e-05, + "loss": 0.3399, + "step": 580, + "task_loss": 0.8706581592559814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23800015449523926, + "epoch": 0.49, + "learning_rate": 4.649196956889265e-05, + "loss": 0.327, + "step": 581, + "task_loss": 0.5887655019760132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5023128986358643, + "epoch": 0.49, + "learning_rate": 4.648593165076682e-05, + "loss": 0.4798, + "step": 582, + "task_loss": 1.2295455932617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24959447979927063, + "epoch": 0.49, + "learning_rate": 4.647989373264099e-05, + "loss": 0.3407, + "step": 583, + "task_loss": 0.19351419806480408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1406758725643158, + "epoch": 0.49, + "learning_rate": 4.6473855814515155e-05, + "loss": 0.3282, + "step": 584, + "task_loss": 1.069644808769226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3988448977470398, + "epoch": 0.49, + "learning_rate": 4.646781789638933e-05, + "loss": 0.4157, + "step": 585, + "task_loss": 1.2030136585235596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21727235615253448, + "epoch": 0.5, + "learning_rate": 4.6461779978263496e-05, + "loss": 0.351, + "step": 586, + "task_loss": 0.13985687494277954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32491010427474976, + "epoch": 0.5, + "learning_rate": 4.645574206013767e-05, + "loss": 0.3861, + "step": 587, + "task_loss": 0.5897377729415894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.526580274105072, + "epoch": 0.5, + "learning_rate": 4.644970414201184e-05, + "loss": 0.3875, + "step": 588, + "task_loss": 0.8688368201255798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19158503413200378, + "epoch": 0.5, + "learning_rate": 4.6443666223886005e-05, + "loss": 0.3536, + "step": 589, + "task_loss": 0.3490241467952728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35543888807296753, + "epoch": 0.5, + "learning_rate": 4.643762830576018e-05, + "loss": 0.3452, + "step": 590, + "task_loss": 0.9998995661735535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1599348485469818, + "epoch": 0.5, + "learning_rate": 4.6431590387634346e-05, + "loss": 0.3203, + "step": 591, + "task_loss": 0.18656522035598755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2544928789138794, + "epoch": 0.5, + "learning_rate": 4.642555246950851e-05, + "loss": 0.3223, + "step": 592, + "task_loss": 0.25684264302253723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2261628806591034, + "epoch": 0.5, + "learning_rate": 4.641951455138269e-05, + "loss": 0.2666, + "step": 593, + "task_loss": 0.2724857032299042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14076194167137146, + "epoch": 0.5, + "learning_rate": 4.6413476633256854e-05, + "loss": 0.2639, + "step": 594, + "task_loss": 0.298418790102005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30404239892959595, + "epoch": 0.5, + "learning_rate": 4.640743871513102e-05, + "loss": 0.3223, + "step": 595, + "task_loss": 0.3522747755050659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.308472216129303, + "epoch": 0.5, + "learning_rate": 4.6401400797005195e-05, + "loss": 0.3031, + "step": 596, + "task_loss": 1.0812458992004395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22775940597057343, + "epoch": 0.5, + "learning_rate": 4.639536287887937e-05, + "loss": 0.3519, + "step": 597, + "task_loss": 0.32643061876296997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3233962059020996, + "epoch": 0.51, + "learning_rate": 4.638932496075353e-05, + "loss": 0.4218, + "step": 598, + "task_loss": 0.7972854375839233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24168744683265686, + "epoch": 0.51, + "learning_rate": 4.6383287042627704e-05, + "loss": 0.3266, + "step": 599, + "task_loss": 0.3753623366355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4203510582447052, + "epoch": 0.51, + "learning_rate": 4.637724912450188e-05, + "loss": 0.4241, + "step": 600, + "task_loss": 1.0043474435806274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23040008544921875, + "epoch": 0.51, + "learning_rate": 4.6371211206376045e-05, + "loss": 0.4598, + "step": 601, + "task_loss": 1.4317598342895508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2298530638217926, + "epoch": 0.51, + "learning_rate": 4.636517328825021e-05, + "loss": 0.3663, + "step": 602, + "task_loss": 0.6105636358261108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4078867733478546, + "epoch": 0.51, + "learning_rate": 4.6359135370124386e-05, + "loss": 0.2485, + "step": 603, + "task_loss": 0.3292267322540283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5903968811035156, + "epoch": 0.51, + "learning_rate": 4.635309745199855e-05, + "loss": 0.3066, + "step": 604, + "task_loss": 1.089480996131897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29476842284202576, + "epoch": 0.51, + "learning_rate": 4.634705953387272e-05, + "loss": 0.3085, + "step": 605, + "task_loss": 0.45623448491096497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14264142513275146, + "epoch": 0.51, + "learning_rate": 4.6341021615746894e-05, + "loss": 0.2667, + "step": 606, + "task_loss": 0.05452156066894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4609898626804352, + "epoch": 0.51, + "learning_rate": 4.633498369762107e-05, + "loss": 0.3163, + "step": 607, + "task_loss": 0.44029584527015686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6171891689300537, + "epoch": 0.51, + "learning_rate": 4.632894577949523e-05, + "loss": 0.4315, + "step": 608, + "task_loss": 0.9165663719177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3013846278190613, + "epoch": 0.51, + "learning_rate": 4.63229078613694e-05, + "loss": 0.3415, + "step": 609, + "task_loss": 0.63967365026474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2746434807777405, + "epoch": 0.52, + "learning_rate": 4.6316869943243576e-05, + "loss": 0.3267, + "step": 610, + "task_loss": 0.5338064432144165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3651568293571472, + "epoch": 0.52, + "learning_rate": 4.631083202511774e-05, + "loss": 0.3393, + "step": 611, + "task_loss": 1.079981803894043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23759052157402039, + "epoch": 0.52, + "learning_rate": 4.630479410699191e-05, + "loss": 0.3861, + "step": 612, + "task_loss": 0.6698994636535645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5082151293754578, + "epoch": 0.52, + "learning_rate": 4.6298756188866085e-05, + "loss": 0.451, + "step": 613, + "task_loss": 0.8963048458099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2931552231311798, + "epoch": 0.52, + "learning_rate": 4.6292718270740245e-05, + "loss": 0.3718, + "step": 614, + "task_loss": 1.26703941822052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.284406453371048, + "epoch": 0.52, + "learning_rate": 4.628668035261442e-05, + "loss": 0.4436, + "step": 615, + "task_loss": 1.153813362121582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.346671462059021, + "epoch": 0.52, + "learning_rate": 4.628064243448859e-05, + "loss": 0.3822, + "step": 616, + "task_loss": 1.0603476762771606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2777397632598877, + "epoch": 0.52, + "learning_rate": 4.627460451636276e-05, + "loss": 0.3429, + "step": 617, + "task_loss": 1.0770553350448608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5762258768081665, + "epoch": 0.52, + "learning_rate": 4.626856659823693e-05, + "loss": 0.4253, + "step": 618, + "task_loss": 0.914618194103241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4495653808116913, + "epoch": 0.52, + "learning_rate": 4.62625286801111e-05, + "loss": 0.4085, + "step": 619, + "task_loss": 0.21080175042152405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27887922525405884, + "epoch": 0.52, + "learning_rate": 4.625649076198527e-05, + "loss": 0.4165, + "step": 620, + "task_loss": 0.5042296648025513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3461983799934387, + "epoch": 0.52, + "learning_rate": 4.6250452843859436e-05, + "loss": 0.3263, + "step": 621, + "task_loss": 1.0598671436309814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32994264364242554, + "epoch": 0.53, + "learning_rate": 4.624441492573361e-05, + "loss": 0.3668, + "step": 622, + "task_loss": 1.159314513206482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22066111862659454, + "epoch": 0.53, + "learning_rate": 4.6238377007607784e-05, + "loss": 0.3031, + "step": 623, + "task_loss": 0.10674968361854553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43864771723747253, + "epoch": 0.53, + "learning_rate": 4.6232339089481944e-05, + "loss": 0.396, + "step": 624, + "task_loss": 0.5357619524002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4029865264892578, + "epoch": 0.53, + "learning_rate": 4.622630117135612e-05, + "loss": 0.4925, + "step": 625, + "task_loss": 0.5318986177444458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.205038383603096, + "epoch": 0.53, + "learning_rate": 4.622026325323029e-05, + "loss": 0.2959, + "step": 626, + "task_loss": 1.0308891534805298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6130087375640869, + "epoch": 0.53, + "learning_rate": 4.621422533510446e-05, + "loss": 0.4259, + "step": 627, + "task_loss": 0.47725486755371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3744767904281616, + "epoch": 0.53, + "learning_rate": 4.6208187416978626e-05, + "loss": 0.3554, + "step": 628, + "task_loss": 0.7097012996673584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2224428802728653, + "epoch": 0.53, + "learning_rate": 4.62021494988528e-05, + "loss": 0.2753, + "step": 629, + "task_loss": 0.8521153926849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4066242575645447, + "epoch": 0.53, + "learning_rate": 4.619611158072697e-05, + "loss": 0.3714, + "step": 630, + "task_loss": 1.2141882181167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3314343988895416, + "epoch": 0.53, + "learning_rate": 4.6190073662601135e-05, + "loss": 0.2743, + "step": 631, + "task_loss": 0.4873604476451874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31800585985183716, + "epoch": 0.53, + "learning_rate": 4.618403574447531e-05, + "loss": 0.3455, + "step": 632, + "task_loss": 0.9237346053123474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.579411506652832, + "epoch": 0.53, + "learning_rate": 4.6177997826349476e-05, + "loss": 0.3566, + "step": 633, + "task_loss": 0.38747143745422363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24865508079528809, + "epoch": 0.54, + "learning_rate": 4.617195990822364e-05, + "loss": 0.3163, + "step": 634, + "task_loss": 0.8504195213317871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21511569619178772, + "epoch": 0.54, + "learning_rate": 4.616592199009782e-05, + "loss": 0.3294, + "step": 635, + "task_loss": 0.5050124526023865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2974485158920288, + "epoch": 0.54, + "learning_rate": 4.6159884071971984e-05, + "loss": 0.4025, + "step": 636, + "task_loss": 1.4122283458709717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32975679636001587, + "epoch": 0.54, + "learning_rate": 4.615384615384616e-05, + "loss": 0.3319, + "step": 637, + "task_loss": 0.7846709489822388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25379478931427, + "epoch": 0.54, + "learning_rate": 4.6147808235720325e-05, + "loss": 0.3251, + "step": 638, + "task_loss": 1.0721712112426758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4846215546131134, + "epoch": 0.54, + "learning_rate": 4.61417703175945e-05, + "loss": 0.4351, + "step": 639, + "task_loss": 0.22067594528198242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5852885246276855, + "epoch": 0.54, + "learning_rate": 4.6135732399468666e-05, + "loss": 0.3507, + "step": 640, + "task_loss": 0.4556126296520233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13487397134304047, + "epoch": 0.54, + "learning_rate": 4.6129694481342834e-05, + "loss": 0.2611, + "step": 641, + "task_loss": 0.19986118376255035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27105093002319336, + "epoch": 0.54, + "learning_rate": 4.612365656321701e-05, + "loss": 0.3615, + "step": 642, + "task_loss": 1.0428903102874756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2862633466720581, + "epoch": 0.54, + "learning_rate": 4.6117618645091175e-05, + "loss": 0.4324, + "step": 643, + "task_loss": 1.042823314666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17515414953231812, + "epoch": 0.54, + "learning_rate": 4.611158072696534e-05, + "loss": 0.2985, + "step": 644, + "task_loss": 0.28693491220474243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24967364966869354, + "epoch": 0.54, + "learning_rate": 4.6105542808839516e-05, + "loss": 0.3359, + "step": 645, + "task_loss": 0.8768274784088135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2654966115951538, + "epoch": 0.55, + "learning_rate": 4.609950489071368e-05, + "loss": 0.3514, + "step": 646, + "task_loss": 0.2290060669183731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6174731254577637, + "epoch": 0.55, + "learning_rate": 4.609346697258786e-05, + "loss": 0.394, + "step": 647, + "task_loss": 0.34061920642852783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2054482102394104, + "epoch": 0.55, + "learning_rate": 4.6087429054462024e-05, + "loss": 0.4386, + "step": 648, + "task_loss": 0.3258286118507385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40916475653648376, + "epoch": 0.55, + "learning_rate": 4.608139113633619e-05, + "loss": 0.3831, + "step": 649, + "task_loss": 0.4465845227241516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3240598142147064, + "epoch": 0.55, + "learning_rate": 4.6075353218210365e-05, + "loss": 0.3164, + "step": 650, + "task_loss": 0.9258898496627808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.314564049243927, + "epoch": 0.55, + "learning_rate": 4.606931530008453e-05, + "loss": 0.3354, + "step": 651, + "task_loss": 0.9715918898582458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24291005730628967, + "epoch": 0.55, + "learning_rate": 4.60632773819587e-05, + "loss": 0.3064, + "step": 652, + "task_loss": 0.5996285080909729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24056079983711243, + "epoch": 0.55, + "learning_rate": 4.6057239463832874e-05, + "loss": 0.4083, + "step": 653, + "task_loss": 0.1843840479850769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3044561743736267, + "epoch": 0.55, + "learning_rate": 4.605120154570704e-05, + "loss": 0.3245, + "step": 654, + "task_loss": 0.6563989520072937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5701591968536377, + "epoch": 0.55, + "learning_rate": 4.6045163627581215e-05, + "loss": 0.3578, + "step": 655, + "task_loss": 0.16813500225543976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26367518305778503, + "epoch": 0.55, + "learning_rate": 4.603912570945538e-05, + "loss": 0.4461, + "step": 656, + "task_loss": 0.5315459370613098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3030101954936981, + "epoch": 0.56, + "learning_rate": 4.6033087791329556e-05, + "loss": 0.344, + "step": 657, + "task_loss": 0.7940163016319275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23225125670433044, + "epoch": 0.56, + "learning_rate": 4.602704987320372e-05, + "loss": 0.3637, + "step": 658, + "task_loss": 0.5889531970024109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33130282163619995, + "epoch": 0.56, + "learning_rate": 4.602101195507789e-05, + "loss": 0.4226, + "step": 659, + "task_loss": 0.8525395393371582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20055104792118073, + "epoch": 0.56, + "learning_rate": 4.6014974036952064e-05, + "loss": 0.4213, + "step": 660, + "task_loss": 0.3496905565261841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5440434217453003, + "epoch": 0.56, + "learning_rate": 4.600893611882623e-05, + "loss": 0.448, + "step": 661, + "task_loss": 0.896485447883606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2198769897222519, + "epoch": 0.56, + "learning_rate": 4.60028982007004e-05, + "loss": 0.3428, + "step": 662, + "task_loss": 0.4718659818172455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45973771810531616, + "epoch": 0.56, + "learning_rate": 4.599686028257457e-05, + "loss": 0.3601, + "step": 663, + "task_loss": 0.14959101378917694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36007171869277954, + "epoch": 0.56, + "learning_rate": 4.599082236444874e-05, + "loss": 0.367, + "step": 664, + "task_loss": 0.1878933608531952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30151093006134033, + "epoch": 0.56, + "learning_rate": 4.598478444632291e-05, + "loss": 0.3279, + "step": 665, + "task_loss": 0.5680833458900452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44395750761032104, + "epoch": 0.56, + "learning_rate": 4.597874652819708e-05, + "loss": 0.3443, + "step": 666, + "task_loss": 0.38879168033599854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3097415566444397, + "epoch": 0.56, + "learning_rate": 4.5972708610071255e-05, + "loss": 0.3417, + "step": 667, + "task_loss": 0.5373480319976807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30059152841567993, + "epoch": 0.56, + "learning_rate": 4.5966670691945415e-05, + "loss": 0.3592, + "step": 668, + "task_loss": 0.645464301109314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2942499816417694, + "epoch": 0.57, + "learning_rate": 4.596063277381959e-05, + "loss": 0.3628, + "step": 669, + "task_loss": 0.9490315914154053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27622634172439575, + "epoch": 0.57, + "learning_rate": 4.595459485569376e-05, + "loss": 0.2631, + "step": 670, + "task_loss": 0.3641386330127716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27426183223724365, + "epoch": 0.57, + "learning_rate": 4.5948556937567924e-05, + "loss": 0.3301, + "step": 671, + "task_loss": 0.9640410542488098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33635398745536804, + "epoch": 0.57, + "learning_rate": 4.59425190194421e-05, + "loss": 0.4058, + "step": 672, + "task_loss": 0.6574383974075317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43457847833633423, + "epoch": 0.57, + "learning_rate": 4.593648110131627e-05, + "loss": 0.2719, + "step": 673, + "task_loss": 1.1977999210357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19407469034194946, + "epoch": 0.57, + "learning_rate": 4.593044318319044e-05, + "loss": 0.295, + "step": 674, + "task_loss": 0.307859867811203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39448654651641846, + "epoch": 0.57, + "learning_rate": 4.5924405265064606e-05, + "loss": 0.4649, + "step": 675, + "task_loss": 1.0258768796920776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2316320538520813, + "epoch": 0.57, + "learning_rate": 4.591836734693878e-05, + "loss": 0.4397, + "step": 676, + "task_loss": 0.6709340810775757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2632187008857727, + "epoch": 0.57, + "learning_rate": 4.591232942881295e-05, + "loss": 0.3157, + "step": 677, + "task_loss": 0.28071120381355286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3112484812736511, + "epoch": 0.57, + "learning_rate": 4.5906291510687114e-05, + "loss": 0.3235, + "step": 678, + "task_loss": 0.20828117430210114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3335568308830261, + "epoch": 0.57, + "learning_rate": 4.590025359256129e-05, + "loss": 0.3629, + "step": 679, + "task_loss": 0.5164096355438232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23583701252937317, + "epoch": 0.57, + "learning_rate": 4.589421567443546e-05, + "loss": 0.2623, + "step": 680, + "task_loss": 0.7877901196479797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2872389256954193, + "epoch": 0.58, + "learning_rate": 4.588817775630962e-05, + "loss": 0.3789, + "step": 681, + "task_loss": 0.12168869376182556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14790436625480652, + "epoch": 0.58, + "learning_rate": 4.5882139838183797e-05, + "loss": 0.3465, + "step": 682, + "task_loss": 0.34522756934165955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7403531670570374, + "epoch": 0.58, + "learning_rate": 4.587610192005797e-05, + "loss": 0.4842, + "step": 683, + "task_loss": 0.6939802765846252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23989468812942505, + "epoch": 0.58, + "learning_rate": 4.587006400193213e-05, + "loss": 0.2582, + "step": 684, + "task_loss": 0.6449297070503235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20592886209487915, + "epoch": 0.58, + "learning_rate": 4.5864026083806305e-05, + "loss": 0.3697, + "step": 685, + "task_loss": 0.3713332414627075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3903454840183258, + "epoch": 0.58, + "learning_rate": 4.585798816568048e-05, + "loss": 0.2636, + "step": 686, + "task_loss": 0.19788767397403717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.336739718914032, + "epoch": 0.58, + "learning_rate": 4.5851950247554646e-05, + "loss": 0.3095, + "step": 687, + "task_loss": 1.0469329357147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.170437291264534, + "epoch": 0.58, + "learning_rate": 4.584591232942881e-05, + "loss": 0.2963, + "step": 688, + "task_loss": 0.7811930179595947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29482707381248474, + "epoch": 0.58, + "learning_rate": 4.583987441130299e-05, + "loss": 0.3391, + "step": 689, + "task_loss": 0.28795668482780457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12704573571681976, + "epoch": 0.58, + "learning_rate": 4.5833836493177154e-05, + "loss": 0.2565, + "step": 690, + "task_loss": 0.26545649766921997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27358075976371765, + "epoch": 0.58, + "learning_rate": 4.582779857505132e-05, + "loss": 0.3741, + "step": 691, + "task_loss": 0.47030144929885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40781766176223755, + "epoch": 0.58, + "learning_rate": 4.5821760656925496e-05, + "loss": 0.37, + "step": 692, + "task_loss": 0.934550404548645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2664041817188263, + "epoch": 0.59, + "learning_rate": 4.581572273879966e-05, + "loss": 0.5318, + "step": 693, + "task_loss": 1.5197149515151978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7974572777748108, + "epoch": 0.59, + "learning_rate": 4.580968482067383e-05, + "loss": 0.4612, + "step": 694, + "task_loss": 0.7451560497283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4035872220993042, + "epoch": 0.59, + "learning_rate": 4.5803646902548004e-05, + "loss": 0.3658, + "step": 695, + "task_loss": 1.047471046447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24698840081691742, + "epoch": 0.59, + "learning_rate": 4.579760898442218e-05, + "loss": 0.3036, + "step": 696, + "task_loss": 0.605748176574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43346935510635376, + "epoch": 0.59, + "learning_rate": 4.5791571066296345e-05, + "loss": 0.3711, + "step": 697, + "task_loss": 0.8229206204414368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.438349187374115, + "epoch": 0.59, + "learning_rate": 4.578553314817051e-05, + "loss": 0.4623, + "step": 698, + "task_loss": 0.3380439877510071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31243640184402466, + "epoch": 0.59, + "learning_rate": 4.5779495230044686e-05, + "loss": 0.3853, + "step": 699, + "task_loss": 0.4344436824321747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46454307436943054, + "epoch": 0.59, + "learning_rate": 4.577345731191885e-05, + "loss": 0.444, + "step": 700, + "task_loss": 0.5624050498008728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18277767300605774, + "epoch": 0.59, + "learning_rate": 4.576741939379302e-05, + "loss": 0.4415, + "step": 701, + "task_loss": 0.45513394474983215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21187463402748108, + "epoch": 0.59, + "learning_rate": 4.5761381475667194e-05, + "loss": 0.2791, + "step": 702, + "task_loss": 0.6887025237083435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27018314599990845, + "epoch": 0.59, + "learning_rate": 4.575534355754136e-05, + "loss": 0.2991, + "step": 703, + "task_loss": 0.8194954991340637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30422303080558777, + "epoch": 0.59, + "learning_rate": 4.574930563941553e-05, + "loss": 0.3206, + "step": 704, + "task_loss": 1.6382495164871216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5321090221405029, + "epoch": 0.6, + "learning_rate": 4.57432677212897e-05, + "loss": 0.3869, + "step": 705, + "task_loss": 1.6544432640075684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2524600625038147, + "epoch": 0.6, + "learning_rate": 4.573722980316387e-05, + "loss": 0.3343, + "step": 706, + "task_loss": 0.36589089035987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2441057562828064, + "epoch": 0.6, + "learning_rate": 4.573119188503804e-05, + "loss": 0.3764, + "step": 707, + "task_loss": 0.4601496160030365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2476460486650467, + "epoch": 0.6, + "learning_rate": 4.572515396691221e-05, + "loss": 0.292, + "step": 708, + "task_loss": 0.6016090512275696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21149423718452454, + "epoch": 0.6, + "learning_rate": 4.571911604878638e-05, + "loss": 0.5381, + "step": 709, + "task_loss": 0.2821265757083893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22485455870628357, + "epoch": 0.6, + "learning_rate": 4.571307813066055e-05, + "loss": 0.3785, + "step": 710, + "task_loss": 0.7016220092773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2019926905632019, + "epoch": 0.6, + "learning_rate": 4.570704021253472e-05, + "loss": 0.3841, + "step": 711, + "task_loss": 1.3649837970733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42618387937545776, + "epoch": 0.6, + "learning_rate": 4.570100229440889e-05, + "loss": 0.455, + "step": 712, + "task_loss": 0.8006240129470825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22870805859565735, + "epoch": 0.6, + "learning_rate": 4.569496437628306e-05, + "loss": 0.2952, + "step": 713, + "task_loss": 0.1706569790840149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33901914954185486, + "epoch": 0.6, + "learning_rate": 4.568892645815723e-05, + "loss": 0.3755, + "step": 714, + "task_loss": 0.10086208581924438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19824597239494324, + "epoch": 0.6, + "learning_rate": 4.56828885400314e-05, + "loss": 0.2703, + "step": 715, + "task_loss": 0.4939492642879486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29819124937057495, + "epoch": 0.6, + "learning_rate": 4.567685062190557e-05, + "loss": 0.3263, + "step": 716, + "task_loss": 0.9665405750274658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24669227004051208, + "epoch": 0.61, + "learning_rate": 4.5670812703779736e-05, + "loss": 0.4604, + "step": 717, + "task_loss": 0.7395060658454895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15106871724128723, + "epoch": 0.61, + "learning_rate": 4.566477478565391e-05, + "loss": 0.3035, + "step": 718, + "task_loss": 0.26799696683883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22672738134860992, + "epoch": 0.61, + "learning_rate": 4.565873686752808e-05, + "loss": 0.305, + "step": 719, + "task_loss": 0.8576157093048096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5736232995986938, + "epoch": 0.61, + "learning_rate": 4.565269894940225e-05, + "loss": 0.517, + "step": 720, + "task_loss": 1.0136209726333618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40861254930496216, + "epoch": 0.61, + "learning_rate": 4.564666103127642e-05, + "loss": 0.3475, + "step": 721, + "task_loss": 0.8161322474479675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44145435094833374, + "epoch": 0.61, + "learning_rate": 4.5640623113150586e-05, + "loss": 0.3303, + "step": 722, + "task_loss": 0.6067239046096802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14312711358070374, + "epoch": 0.61, + "learning_rate": 4.563458519502476e-05, + "loss": 0.3156, + "step": 723, + "task_loss": 0.37298744916915894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44245588779449463, + "epoch": 0.61, + "learning_rate": 4.562854727689893e-05, + "loss": 0.4077, + "step": 724, + "task_loss": 0.7636491656303406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6131328344345093, + "epoch": 0.61, + "learning_rate": 4.5622509358773094e-05, + "loss": 0.3716, + "step": 725, + "task_loss": 1.061371922492981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3987913131713867, + "epoch": 0.61, + "learning_rate": 4.561647144064727e-05, + "loss": 0.4169, + "step": 726, + "task_loss": 0.6444965600967407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37525853514671326, + "epoch": 0.61, + "learning_rate": 4.5610433522521435e-05, + "loss": 0.3535, + "step": 727, + "task_loss": 0.5094666481018066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23235808312892914, + "epoch": 0.61, + "learning_rate": 4.56043956043956e-05, + "loss": 0.3431, + "step": 728, + "task_loss": 0.591149091720581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16249266266822815, + "epoch": 0.62, + "learning_rate": 4.5598357686269776e-05, + "loss": 0.3705, + "step": 729, + "task_loss": 0.6060795783996582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20443367958068848, + "epoch": 0.62, + "learning_rate": 4.559231976814395e-05, + "loss": 0.3387, + "step": 730, + "task_loss": 0.3445514440536499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2696666121482849, + "epoch": 0.62, + "learning_rate": 4.558628185001812e-05, + "loss": 0.3281, + "step": 731, + "task_loss": 0.7198132872581482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28837472200393677, + "epoch": 0.62, + "learning_rate": 4.5580243931892284e-05, + "loss": 0.3746, + "step": 732, + "task_loss": 1.1804611682891846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3295768201351166, + "epoch": 0.62, + "learning_rate": 4.557420601376646e-05, + "loss": 0.4039, + "step": 733, + "task_loss": 1.3205760717391968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27145543694496155, + "epoch": 0.62, + "learning_rate": 4.5568168095640626e-05, + "loss": 0.3069, + "step": 734, + "task_loss": 0.5923184156417847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3135417103767395, + "epoch": 0.62, + "learning_rate": 4.556213017751479e-05, + "loss": 0.4086, + "step": 735, + "task_loss": 0.43117162585258484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28134819865226746, + "epoch": 0.62, + "learning_rate": 4.555609225938897e-05, + "loss": 0.3104, + "step": 736, + "task_loss": 0.49730628728866577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4486047625541687, + "epoch": 0.62, + "learning_rate": 4.5550054341263134e-05, + "loss": 0.3291, + "step": 737, + "task_loss": 0.15193206071853638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22780045866966248, + "epoch": 0.62, + "learning_rate": 4.55440164231373e-05, + "loss": 0.3593, + "step": 738, + "task_loss": 0.16805656254291534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2513253092765808, + "epoch": 0.62, + "learning_rate": 4.5537978505011475e-05, + "loss": 0.3323, + "step": 739, + "task_loss": 0.5934991240501404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22833995521068573, + "epoch": 0.63, + "learning_rate": 4.553194058688565e-05, + "loss": 0.2774, + "step": 740, + "task_loss": 0.618530809879303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3058266043663025, + "epoch": 0.63, + "learning_rate": 4.552590266875981e-05, + "loss": 0.3478, + "step": 741, + "task_loss": 0.046308696269989014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29397332668304443, + "epoch": 0.63, + "learning_rate": 4.5519864750633983e-05, + "loss": 0.324, + "step": 742, + "task_loss": 0.47769293189048767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37344565987586975, + "epoch": 0.63, + "learning_rate": 4.551382683250816e-05, + "loss": 0.3952, + "step": 743, + "task_loss": 1.2599339485168457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3123040795326233, + "epoch": 0.63, + "learning_rate": 4.550778891438232e-05, + "loss": 0.3517, + "step": 744, + "task_loss": 1.3710837364196777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4613143801689148, + "epoch": 0.63, + "learning_rate": 4.550175099625649e-05, + "loss": 0.3388, + "step": 745, + "task_loss": 0.3595305383205414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19071230292320251, + "epoch": 0.63, + "learning_rate": 4.5495713078130666e-05, + "loss": 0.3515, + "step": 746, + "task_loss": 0.9351141452789307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1858353614807129, + "epoch": 0.63, + "learning_rate": 4.548967516000483e-05, + "loss": 0.2997, + "step": 747, + "task_loss": 0.2936874032020569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37073928117752075, + "epoch": 0.63, + "learning_rate": 4.5483637241879e-05, + "loss": 0.3776, + "step": 748, + "task_loss": 0.985735297203064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39584609866142273, + "epoch": 0.63, + "learning_rate": 4.5477599323753174e-05, + "loss": 0.3023, + "step": 749, + "task_loss": 0.2725343108177185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3656730651855469, + "epoch": 0.63, + "learning_rate": 4.547156140562734e-05, + "loss": 0.3882, + "step": 750, + "task_loss": 0.6669621467590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4899216592311859, + "epoch": 0.63, + "learning_rate": 4.546552348750151e-05, + "loss": 0.3371, + "step": 751, + "task_loss": 0.7428222298622131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2465105801820755, + "epoch": 0.64, + "learning_rate": 4.545948556937568e-05, + "loss": 0.348, + "step": 752, + "task_loss": 0.40641671419143677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32330381870269775, + "epoch": 0.64, + "learning_rate": 4.5453447651249856e-05, + "loss": 0.2979, + "step": 753, + "task_loss": 1.0408015251159668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28674131631851196, + "epoch": 0.64, + "learning_rate": 4.544740973312402e-05, + "loss": 0.3006, + "step": 754, + "task_loss": 0.6484240889549255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2678007483482361, + "epoch": 0.64, + "learning_rate": 4.544137181499819e-05, + "loss": 0.3234, + "step": 755, + "task_loss": 0.6351743936538696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.12141424417495728, + "epoch": 0.64, + "learning_rate": 4.5435333896872365e-05, + "loss": 0.265, + "step": 756, + "task_loss": 0.5992752313613892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24598108232021332, + "epoch": 0.64, + "learning_rate": 4.5429295978746525e-05, + "loss": 0.4344, + "step": 757, + "task_loss": 0.36463993787765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18556594848632812, + "epoch": 0.64, + "learning_rate": 4.54232580606207e-05, + "loss": 0.2636, + "step": 758, + "task_loss": 0.14679035544395447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15820741653442383, + "epoch": 0.64, + "learning_rate": 4.541722014249487e-05, + "loss": 0.2695, + "step": 759, + "task_loss": 0.013220874592661858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33770519495010376, + "epoch": 0.64, + "learning_rate": 4.541118222436904e-05, + "loss": 0.3761, + "step": 760, + "task_loss": 0.4439811110496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18101146817207336, + "epoch": 0.64, + "learning_rate": 4.540514430624321e-05, + "loss": 0.3802, + "step": 761, + "task_loss": 0.502497136592865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24307934939861298, + "epoch": 0.64, + "learning_rate": 4.539910638811738e-05, + "loss": 0.2357, + "step": 762, + "task_loss": 1.0100958347320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2827172875404358, + "epoch": 0.64, + "learning_rate": 4.539306846999155e-05, + "loss": 0.4196, + "step": 763, + "task_loss": 0.2085815668106079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3874979317188263, + "epoch": 0.65, + "learning_rate": 4.5387030551865716e-05, + "loss": 0.3727, + "step": 764, + "task_loss": 0.4113468825817108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34265443682670593, + "epoch": 0.65, + "learning_rate": 4.538099263373989e-05, + "loss": 0.3238, + "step": 765, + "task_loss": 0.29426100850105286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18266001343727112, + "epoch": 0.65, + "learning_rate": 4.537495471561406e-05, + "loss": 0.3644, + "step": 766, + "task_loss": 0.4295598864555359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25349992513656616, + "epoch": 0.65, + "learning_rate": 4.5368916797488224e-05, + "loss": 0.2717, + "step": 767, + "task_loss": 0.7366878986358643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2427140176296234, + "epoch": 0.65, + "learning_rate": 4.53628788793624e-05, + "loss": 0.3033, + "step": 768, + "task_loss": 0.2117612659931183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31193143129348755, + "epoch": 0.65, + "learning_rate": 4.535684096123657e-05, + "loss": 0.3336, + "step": 769, + "task_loss": 1.5904284715652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3614553213119507, + "epoch": 0.65, + "learning_rate": 4.535080304311074e-05, + "loss": 0.373, + "step": 770, + "task_loss": 0.5245720744132996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23396113514900208, + "epoch": 0.65, + "learning_rate": 4.5344765124984906e-05, + "loss": 0.3966, + "step": 771, + "task_loss": 0.3384198546409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2826487123966217, + "epoch": 0.65, + "learning_rate": 4.533872720685908e-05, + "loss": 0.3482, + "step": 772, + "task_loss": 0.8049398064613342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2899158000946045, + "epoch": 0.65, + "learning_rate": 4.533268928873325e-05, + "loss": 0.3689, + "step": 773, + "task_loss": 0.32007095217704773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34210312366485596, + "epoch": 0.65, + "learning_rate": 4.5326651370607415e-05, + "loss": 0.3304, + "step": 774, + "task_loss": 1.3622010946273804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23519110679626465, + "epoch": 0.65, + "learning_rate": 4.532061345248159e-05, + "loss": 0.2949, + "step": 775, + "task_loss": 0.9312596917152405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.351754367351532, + "epoch": 0.66, + "learning_rate": 4.5314575534355756e-05, + "loss": 0.2727, + "step": 776, + "task_loss": 0.5468899011611938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.180947408080101, + "epoch": 0.66, + "learning_rate": 4.530853761622992e-05, + "loss": 0.2183, + "step": 777, + "task_loss": 0.0869239792227745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4783400297164917, + "epoch": 0.66, + "learning_rate": 4.53024996981041e-05, + "loss": 0.3296, + "step": 778, + "task_loss": 0.6275749206542969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3339013159275055, + "epoch": 0.66, + "learning_rate": 4.5296461779978264e-05, + "loss": 0.3194, + "step": 779, + "task_loss": 0.4619302749633789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3331663906574249, + "epoch": 0.66, + "learning_rate": 4.529042386185244e-05, + "loss": 0.2648, + "step": 780, + "task_loss": 0.2219018042087555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18654146790504456, + "epoch": 0.66, + "learning_rate": 4.5284385943726605e-05, + "loss": 0.3174, + "step": 781, + "task_loss": 0.5415080785751343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2856631278991699, + "epoch": 0.66, + "learning_rate": 4.527834802560077e-05, + "loss": 0.339, + "step": 782, + "task_loss": 1.3013662099838257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22131982445716858, + "epoch": 0.66, + "learning_rate": 4.5272310107474946e-05, + "loss": 0.2747, + "step": 783, + "task_loss": 0.20025552809238434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3150436580181122, + "epoch": 0.66, + "learning_rate": 4.5266272189349114e-05, + "loss": 0.4531, + "step": 784, + "task_loss": 0.4217085838317871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3068576455116272, + "epoch": 0.66, + "learning_rate": 4.526023427122328e-05, + "loss": 0.3926, + "step": 785, + "task_loss": 0.8694249987602234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20825308561325073, + "epoch": 0.66, + "learning_rate": 4.5254196353097455e-05, + "loss": 0.4075, + "step": 786, + "task_loss": 0.0634683147072792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26817312836647034, + "epoch": 0.66, + "learning_rate": 4.524815843497162e-05, + "loss": 0.3672, + "step": 787, + "task_loss": 0.5024194121360779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26155024766921997, + "epoch": 0.67, + "learning_rate": 4.5242120516845796e-05, + "loss": 0.3012, + "step": 788, + "task_loss": 0.29033157229423523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24371114373207092, + "epoch": 0.67, + "learning_rate": 4.523608259871996e-05, + "loss": 0.3037, + "step": 789, + "task_loss": 0.4643123149871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20182812213897705, + "epoch": 0.67, + "learning_rate": 4.523004468059414e-05, + "loss": 0.2759, + "step": 790, + "task_loss": 1.0971732139587402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2818538248538971, + "epoch": 0.67, + "learning_rate": 4.5224006762468304e-05, + "loss": 0.4353, + "step": 791, + "task_loss": 0.2589888572692871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27974045276641846, + "epoch": 0.67, + "learning_rate": 4.521796884434247e-05, + "loss": 0.3152, + "step": 792, + "task_loss": 0.9788235425949097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40452176332473755, + "epoch": 0.67, + "learning_rate": 4.5211930926216645e-05, + "loss": 0.3712, + "step": 793, + "task_loss": 0.7325109839439392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27447766065597534, + "epoch": 0.67, + "learning_rate": 4.520589300809081e-05, + "loss": 0.343, + "step": 794, + "task_loss": 0.6478328108787537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3494625687599182, + "epoch": 0.67, + "learning_rate": 4.519985508996498e-05, + "loss": 0.3927, + "step": 795, + "task_loss": 0.41533729434013367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18138387799263, + "epoch": 0.67, + "learning_rate": 4.5193817171839154e-05, + "loss": 0.2647, + "step": 796, + "task_loss": 0.48322778940200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3385716378688812, + "epoch": 0.67, + "learning_rate": 4.518777925371332e-05, + "loss": 0.3245, + "step": 797, + "task_loss": 0.5529921054840088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45837950706481934, + "epoch": 0.67, + "learning_rate": 4.518174133558749e-05, + "loss": 0.3665, + "step": 798, + "task_loss": 1.170419454574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.250112384557724, + "epoch": 0.67, + "learning_rate": 4.517570341746166e-05, + "loss": 0.4368, + "step": 799, + "task_loss": 0.6176565289497375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16649723052978516, + "epoch": 0.68, + "learning_rate": 4.5169665499335836e-05, + "loss": 0.3061, + "step": 800, + "task_loss": 0.5086045861244202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20179831981658936, + "epoch": 0.68, + "learning_rate": 4.5163627581209996e-05, + "loss": 0.2442, + "step": 801, + "task_loss": 0.09176841378211975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38554054498672485, + "epoch": 0.68, + "learning_rate": 4.515758966308417e-05, + "loss": 0.3646, + "step": 802, + "task_loss": 0.5932177305221558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30483534932136536, + "epoch": 0.68, + "learning_rate": 4.5151551744958344e-05, + "loss": 0.437, + "step": 803, + "task_loss": 0.9456331729888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30818307399749756, + "epoch": 0.68, + "learning_rate": 4.514551382683251e-05, + "loss": 0.2894, + "step": 804, + "task_loss": 1.2488499879837036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3145252466201782, + "epoch": 0.68, + "learning_rate": 4.513947590870668e-05, + "loss": 0.3799, + "step": 805, + "task_loss": 0.7992182970046997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24541282653808594, + "epoch": 0.68, + "learning_rate": 4.513343799058085e-05, + "loss": 0.32, + "step": 806, + "task_loss": 0.22804608941078186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3519752621650696, + "epoch": 0.68, + "learning_rate": 4.512740007245502e-05, + "loss": 0.322, + "step": 807, + "task_loss": 0.25990596413612366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4632500410079956, + "epoch": 0.68, + "learning_rate": 4.512136215432919e-05, + "loss": 0.426, + "step": 808, + "task_loss": 0.9388142824172974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4405101239681244, + "epoch": 0.68, + "learning_rate": 4.511532423620336e-05, + "loss": 0.2835, + "step": 809, + "task_loss": 0.921435534954071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3464605510234833, + "epoch": 0.68, + "learning_rate": 4.5109286318077535e-05, + "loss": 0.2468, + "step": 810, + "task_loss": 0.9209409356117249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.10707838833332062, + "epoch": 0.69, + "learning_rate": 4.5103248399951695e-05, + "loss": 0.3704, + "step": 811, + "task_loss": 0.01258013118058443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16764624416828156, + "epoch": 0.69, + "learning_rate": 4.509721048182587e-05, + "loss": 0.2984, + "step": 812, + "task_loss": 0.0628896951675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15210193395614624, + "epoch": 0.69, + "learning_rate": 4.509117256370004e-05, + "loss": 0.2296, + "step": 813, + "task_loss": 0.1468479484319687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41215944290161133, + "epoch": 0.69, + "learning_rate": 4.5085134645574204e-05, + "loss": 0.3346, + "step": 814, + "task_loss": 0.4282277226448059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2540086805820465, + "epoch": 0.69, + "learning_rate": 4.507909672744838e-05, + "loss": 0.4271, + "step": 815, + "task_loss": 0.043058332055807114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1628030389547348, + "epoch": 0.69, + "learning_rate": 4.507305880932255e-05, + "loss": 0.39, + "step": 816, + "task_loss": 0.16738420724868774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21274971961975098, + "epoch": 0.69, + "learning_rate": 4.506702089119671e-05, + "loss": 0.2684, + "step": 817, + "task_loss": 0.8701009750366211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.550127387046814, + "epoch": 0.69, + "learning_rate": 4.5060982973070886e-05, + "loss": 0.4048, + "step": 818, + "task_loss": 0.8876651525497437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31898829340934753, + "epoch": 0.69, + "learning_rate": 4.505494505494506e-05, + "loss": 0.3421, + "step": 819, + "task_loss": 0.6312325596809387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21779802441596985, + "epoch": 0.69, + "learning_rate": 4.504890713681923e-05, + "loss": 0.3156, + "step": 820, + "task_loss": 0.27031469345092773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18510892987251282, + "epoch": 0.69, + "learning_rate": 4.5042869218693394e-05, + "loss": 0.2895, + "step": 821, + "task_loss": 0.6171860694885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21389347314834595, + "epoch": 0.69, + "learning_rate": 4.503683130056757e-05, + "loss": 0.2918, + "step": 822, + "task_loss": 0.8571591377258301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27689576148986816, + "epoch": 0.7, + "learning_rate": 4.5030793382441735e-05, + "loss": 0.3184, + "step": 823, + "task_loss": 0.3355971872806549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33154094219207764, + "epoch": 0.7, + "learning_rate": 4.50247554643159e-05, + "loss": 0.2995, + "step": 824, + "task_loss": 0.6020426750183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21476851403713226, + "epoch": 0.7, + "learning_rate": 4.5018717546190076e-05, + "loss": 0.2518, + "step": 825, + "task_loss": 0.39440909028053284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20731791853904724, + "epoch": 0.7, + "learning_rate": 4.501267962806425e-05, + "loss": 0.2677, + "step": 826, + "task_loss": 0.5119296312332153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5100505948066711, + "epoch": 0.7, + "learning_rate": 4.500664170993841e-05, + "loss": 0.3259, + "step": 827, + "task_loss": 0.9436101913452148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5131642818450928, + "epoch": 0.7, + "learning_rate": 4.5000603791812585e-05, + "loss": 0.3358, + "step": 828, + "task_loss": 0.3923644423484802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3875989615917206, + "epoch": 0.7, + "learning_rate": 4.499456587368676e-05, + "loss": 0.4408, + "step": 829, + "task_loss": 0.5660097599029541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4496367275714874, + "epoch": 0.7, + "learning_rate": 4.4988527955560926e-05, + "loss": 0.3581, + "step": 830, + "task_loss": 0.41323772072792053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5072163939476013, + "epoch": 0.7, + "learning_rate": 4.498249003743509e-05, + "loss": 0.4033, + "step": 831, + "task_loss": 1.2710411548614502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1818779855966568, + "epoch": 0.7, + "learning_rate": 4.497645211930927e-05, + "loss": 0.2262, + "step": 832, + "task_loss": 0.4064873456954956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2969943583011627, + "epoch": 0.7, + "learning_rate": 4.4970414201183434e-05, + "loss": 0.3584, + "step": 833, + "task_loss": 1.0696207284927368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37884706258773804, + "epoch": 0.7, + "learning_rate": 4.49643762830576e-05, + "loss": 0.2946, + "step": 834, + "task_loss": 0.3618290424346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.253044068813324, + "epoch": 0.71, + "learning_rate": 4.4958338364931775e-05, + "loss": 0.2577, + "step": 835, + "task_loss": 0.45334598422050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4066677391529083, + "epoch": 0.71, + "learning_rate": 4.495230044680594e-05, + "loss": 0.3217, + "step": 836, + "task_loss": 0.9497126936912537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3431035280227661, + "epoch": 0.71, + "learning_rate": 4.494626252868011e-05, + "loss": 0.3464, + "step": 837, + "task_loss": 0.6192494034767151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2643086314201355, + "epoch": 0.71, + "learning_rate": 4.4940224610554284e-05, + "loss": 0.3515, + "step": 838, + "task_loss": 1.0006438493728638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5359170436859131, + "epoch": 0.71, + "learning_rate": 4.493418669242845e-05, + "loss": 0.3932, + "step": 839, + "task_loss": 0.17423032224178314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45239830017089844, + "epoch": 0.71, + "learning_rate": 4.4928148774302625e-05, + "loss": 0.3296, + "step": 840, + "task_loss": 0.8025171160697937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3105016052722931, + "epoch": 0.71, + "learning_rate": 4.492211085617679e-05, + "loss": 0.4161, + "step": 841, + "task_loss": 0.6803241968154907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6275674104690552, + "epoch": 0.71, + "learning_rate": 4.4916072938050966e-05, + "loss": 0.4825, + "step": 842, + "task_loss": 0.6989018321037292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24314916133880615, + "epoch": 0.71, + "learning_rate": 4.491003501992513e-05, + "loss": 0.4014, + "step": 843, + "task_loss": 0.1815427541732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21715101599693298, + "epoch": 0.71, + "learning_rate": 4.49039971017993e-05, + "loss": 0.2853, + "step": 844, + "task_loss": 0.0939592570066452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43083101511001587, + "epoch": 0.71, + "learning_rate": 4.4897959183673474e-05, + "loss": 0.3325, + "step": 845, + "task_loss": 0.3811054825782776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46831777691841125, + "epoch": 0.71, + "learning_rate": 4.489192126554764e-05, + "loss": 0.446, + "step": 846, + "task_loss": 0.32218414545059204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1746586412191391, + "epoch": 0.72, + "learning_rate": 4.488588334742181e-05, + "loss": 0.2254, + "step": 847, + "task_loss": 1.381054162979126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18915149569511414, + "epoch": 0.72, + "learning_rate": 4.487984542929598e-05, + "loss": 0.3864, + "step": 848, + "task_loss": 1.0456067323684692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39994895458221436, + "epoch": 0.72, + "learning_rate": 4.487380751117015e-05, + "loss": 0.3548, + "step": 849, + "task_loss": 0.6537964344024658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3476526141166687, + "epoch": 0.72, + "learning_rate": 4.4867769593044324e-05, + "loss": 0.3429, + "step": 850, + "task_loss": 1.0994079113006592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3505191504955292, + "epoch": 0.72, + "learning_rate": 4.486173167491849e-05, + "loss": 0.4278, + "step": 851, + "task_loss": 0.9853001832962036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24659569561481476, + "epoch": 0.72, + "learning_rate": 4.485569375679266e-05, + "loss": 0.3423, + "step": 852, + "task_loss": 0.3349575698375702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19954511523246765, + "epoch": 0.72, + "learning_rate": 4.484965583866683e-05, + "loss": 0.3044, + "step": 853, + "task_loss": 0.3239381015300751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3386889100074768, + "epoch": 0.72, + "learning_rate": 4.4843617920541e-05, + "loss": 0.309, + "step": 854, + "task_loss": 0.2249189168214798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2926265597343445, + "epoch": 0.72, + "learning_rate": 4.4837580002415166e-05, + "loss": 0.2904, + "step": 855, + "task_loss": 0.6071061491966248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2602430582046509, + "epoch": 0.72, + "learning_rate": 4.483154208428934e-05, + "loss": 0.2715, + "step": 856, + "task_loss": 0.07972946017980576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2713681757450104, + "epoch": 0.72, + "learning_rate": 4.482550416616351e-05, + "loss": 0.4366, + "step": 857, + "task_loss": 0.47239094972610474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43284672498703003, + "epoch": 0.72, + "learning_rate": 4.4819466248037675e-05, + "loss": 0.3625, + "step": 858, + "task_loss": 1.0727410316467285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3162996172904968, + "epoch": 0.73, + "learning_rate": 4.481342832991185e-05, + "loss": 0.2562, + "step": 859, + "task_loss": 1.070422887802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24605605006217957, + "epoch": 0.73, + "learning_rate": 4.480739041178602e-05, + "loss": 0.237, + "step": 860, + "task_loss": 0.3509301543235779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5116292834281921, + "epoch": 0.73, + "learning_rate": 4.480135249366019e-05, + "loss": 0.412, + "step": 861, + "task_loss": 0.17709815502166748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26727744936943054, + "epoch": 0.73, + "learning_rate": 4.479531457553436e-05, + "loss": 0.3207, + "step": 862, + "task_loss": 0.6446810364723206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5006260871887207, + "epoch": 0.73, + "learning_rate": 4.478927665740853e-05, + "loss": 0.3148, + "step": 863, + "task_loss": 0.21939678490161896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34987127780914307, + "epoch": 0.73, + "learning_rate": 4.47832387392827e-05, + "loss": 0.3455, + "step": 864, + "task_loss": 0.5132679343223572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30705782771110535, + "epoch": 0.73, + "learning_rate": 4.4777200821156865e-05, + "loss": 0.3065, + "step": 865, + "task_loss": 0.9149826765060425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11884512007236481, + "epoch": 0.73, + "learning_rate": 4.477116290303104e-05, + "loss": 0.2348, + "step": 866, + "task_loss": 0.27015990018844604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1960749626159668, + "epoch": 0.73, + "learning_rate": 4.4765124984905207e-05, + "loss": 0.3135, + "step": 867, + "task_loss": 1.0727053880691528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15920498967170715, + "epoch": 0.73, + "learning_rate": 4.4759087066779374e-05, + "loss": 0.331, + "step": 868, + "task_loss": 0.10256748646497726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3017585575580597, + "epoch": 0.73, + "learning_rate": 4.475304914865355e-05, + "loss": 0.3729, + "step": 869, + "task_loss": 0.6431319713592529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17750011384487152, + "epoch": 0.73, + "learning_rate": 4.474701123052772e-05, + "loss": 0.3423, + "step": 870, + "task_loss": 0.4733154773712158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.164194256067276, + "epoch": 0.74, + "learning_rate": 4.474097331240188e-05, + "loss": 0.2685, + "step": 871, + "task_loss": 0.9117984175682068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42855706810951233, + "epoch": 0.74, + "learning_rate": 4.4734935394276056e-05, + "loss": 0.3357, + "step": 872, + "task_loss": 0.5845022797584534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11093800514936447, + "epoch": 0.74, + "learning_rate": 4.472889747615023e-05, + "loss": 0.2113, + "step": 873, + "task_loss": 0.4649612307548523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1649121344089508, + "epoch": 0.74, + "learning_rate": 4.472285955802439e-05, + "loss": 0.2881, + "step": 874, + "task_loss": 0.48063600063323975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2889198064804077, + "epoch": 0.74, + "learning_rate": 4.4716821639898564e-05, + "loss": 0.2923, + "step": 875, + "task_loss": 0.600864589214325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46347254514694214, + "epoch": 0.74, + "learning_rate": 4.471078372177274e-05, + "loss": 0.3385, + "step": 876, + "task_loss": 0.373192697763443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1478535234928131, + "epoch": 0.74, + "learning_rate": 4.4704745803646905e-05, + "loss": 0.2494, + "step": 877, + "task_loss": 0.7011318802833557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2691228985786438, + "epoch": 0.74, + "learning_rate": 4.469870788552107e-05, + "loss": 0.2829, + "step": 878, + "task_loss": 0.6395071148872375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40578508377075195, + "epoch": 0.74, + "learning_rate": 4.469266996739525e-05, + "loss": 0.333, + "step": 879, + "task_loss": 0.5234085321426392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42064720392227173, + "epoch": 0.74, + "learning_rate": 4.4686632049269414e-05, + "loss": 0.3405, + "step": 880, + "task_loss": 0.7868759632110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30395185947418213, + "epoch": 0.74, + "learning_rate": 4.468059413114358e-05, + "loss": 0.2518, + "step": 881, + "task_loss": 0.2462514191865921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13588252663612366, + "epoch": 0.75, + "learning_rate": 4.4674556213017755e-05, + "loss": 0.3023, + "step": 882, + "task_loss": 0.1645929515361786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19684652984142303, + "epoch": 0.75, + "learning_rate": 4.466851829489193e-05, + "loss": 0.2963, + "step": 883, + "task_loss": 0.696341872215271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3418586254119873, + "epoch": 0.75, + "learning_rate": 4.466248037676609e-05, + "loss": 0.3042, + "step": 884, + "task_loss": 0.3610026240348816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14820131659507751, + "epoch": 0.75, + "learning_rate": 4.465644245864026e-05, + "loss": 0.2708, + "step": 885, + "task_loss": 1.6670372486114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2551877200603485, + "epoch": 0.75, + "learning_rate": 4.465040454051444e-05, + "loss": 0.2551, + "step": 886, + "task_loss": 0.7116249203681946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32418292760849, + "epoch": 0.75, + "learning_rate": 4.46443666223886e-05, + "loss": 0.3433, + "step": 887, + "task_loss": 0.6139531135559082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25696778297424316, + "epoch": 0.75, + "learning_rate": 4.463832870426277e-05, + "loss": 0.2953, + "step": 888, + "task_loss": 0.4976051449775696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28035008907318115, + "epoch": 0.75, + "learning_rate": 4.4632290786136946e-05, + "loss": 0.3449, + "step": 889, + "task_loss": 0.6069408059120178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2542031407356262, + "epoch": 0.75, + "learning_rate": 4.4626252868011106e-05, + "loss": 0.3238, + "step": 890, + "task_loss": 0.3564848303794861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30425703525543213, + "epoch": 0.75, + "learning_rate": 4.462021494988528e-05, + "loss": 0.2975, + "step": 891, + "task_loss": 1.2710225582122803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1999809294939041, + "epoch": 0.75, + "learning_rate": 4.4614177031759454e-05, + "loss": 0.2465, + "step": 892, + "task_loss": 0.4991018772125244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20470239222049713, + "epoch": 0.75, + "learning_rate": 4.460813911363362e-05, + "loss": 0.3433, + "step": 893, + "task_loss": 0.4117075502872467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25381970405578613, + "epoch": 0.76, + "learning_rate": 4.460210119550779e-05, + "loss": 0.3004, + "step": 894, + "task_loss": 1.39918851852417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21731606125831604, + "epoch": 0.76, + "learning_rate": 4.459606327738196e-05, + "loss": 0.304, + "step": 895, + "task_loss": 0.1530483216047287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23831906914710999, + "epoch": 0.76, + "learning_rate": 4.459002535925613e-05, + "loss": 0.3481, + "step": 896, + "task_loss": 0.3927173614501953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23949450254440308, + "epoch": 0.76, + "learning_rate": 4.4583987441130297e-05, + "loss": 0.3068, + "step": 897, + "task_loss": 1.2088288068771362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30444851517677307, + "epoch": 0.76, + "learning_rate": 4.457794952300447e-05, + "loss": 0.3504, + "step": 898, + "task_loss": 0.2169787436723709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32510697841644287, + "epoch": 0.76, + "learning_rate": 4.4571911604878644e-05, + "loss": 0.3373, + "step": 899, + "task_loss": 1.149895191192627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17038384079933167, + "epoch": 0.76, + "learning_rate": 4.4565873686752805e-05, + "loss": 0.314, + "step": 900, + "task_loss": 0.24956150352954865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1839144229888916, + "epoch": 0.76, + "learning_rate": 4.455983576862698e-05, + "loss": 0.3675, + "step": 901, + "task_loss": 1.0437637567520142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46756502985954285, + "epoch": 0.76, + "learning_rate": 4.455379785050115e-05, + "loss": 0.344, + "step": 902, + "task_loss": 0.43766912817955017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2931916117668152, + "epoch": 0.76, + "learning_rate": 4.454775993237532e-05, + "loss": 0.3348, + "step": 903, + "task_loss": 0.8494141697883606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5187391042709351, + "epoch": 0.76, + "learning_rate": 4.454172201424949e-05, + "loss": 0.4149, + "step": 904, + "task_loss": 1.1341482400894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5336723327636719, + "epoch": 0.76, + "learning_rate": 4.453568409612366e-05, + "loss": 0.283, + "step": 905, + "task_loss": 0.9153821468353271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2927427887916565, + "epoch": 0.77, + "learning_rate": 4.452964617799783e-05, + "loss": 0.3614, + "step": 906, + "task_loss": 1.150106430053711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5022847652435303, + "epoch": 0.77, + "learning_rate": 4.4523608259871996e-05, + "loss": 0.3949, + "step": 907, + "task_loss": 0.5168099999427795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4198801517486572, + "epoch": 0.77, + "learning_rate": 4.451757034174617e-05, + "loss": 0.3579, + "step": 908, + "task_loss": 1.7082383632659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3057956397533417, + "epoch": 0.77, + "learning_rate": 4.451153242362034e-05, + "loss": 0.3653, + "step": 909, + "task_loss": 0.6539680361747742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2797132134437561, + "epoch": 0.77, + "learning_rate": 4.4505494505494504e-05, + "loss": 0.3337, + "step": 910, + "task_loss": 0.6889963150024414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29004788398742676, + "epoch": 0.77, + "learning_rate": 4.449945658736868e-05, + "loss": 0.2934, + "step": 911, + "task_loss": 0.1668662130832672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30916184186935425, + "epoch": 0.77, + "learning_rate": 4.4493418669242845e-05, + "loss": 0.3195, + "step": 912, + "task_loss": 0.4844413995742798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48047077655792236, + "epoch": 0.77, + "learning_rate": 4.448738075111702e-05, + "loss": 0.3777, + "step": 913, + "task_loss": 1.37309992313385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2226959466934204, + "epoch": 0.77, + "learning_rate": 4.4481342832991186e-05, + "loss": 0.3126, + "step": 914, + "task_loss": 1.0729975700378418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33201831579208374, + "epoch": 0.77, + "learning_rate": 4.447530491486535e-05, + "loss": 0.2802, + "step": 915, + "task_loss": 0.8840923309326172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17303964495658875, + "epoch": 0.77, + "learning_rate": 4.446926699673953e-05, + "loss": 0.2859, + "step": 916, + "task_loss": 0.5963447093963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27506813406944275, + "epoch": 0.77, + "learning_rate": 4.4463229078613694e-05, + "loss": 0.3101, + "step": 917, + "task_loss": 1.2725977897644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3568727970123291, + "epoch": 0.78, + "learning_rate": 4.445719116048787e-05, + "loss": 0.3362, + "step": 918, + "task_loss": 0.5199917554855347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2657959461212158, + "epoch": 0.78, + "learning_rate": 4.4451153242362036e-05, + "loss": 0.3018, + "step": 919, + "task_loss": 1.0269718170166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5243659019470215, + "epoch": 0.78, + "learning_rate": 4.44451153242362e-05, + "loss": 0.3201, + "step": 920, + "task_loss": 1.1536678075790405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2664206027984619, + "epoch": 0.78, + "learning_rate": 4.443907740611038e-05, + "loss": 0.2609, + "step": 921, + "task_loss": 0.5250079035758972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2522469460964203, + "epoch": 0.78, + "learning_rate": 4.4433039487984544e-05, + "loss": 0.3, + "step": 922, + "task_loss": 0.3302900791168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19866514205932617, + "epoch": 0.78, + "learning_rate": 4.442700156985872e-05, + "loss": 0.3272, + "step": 923, + "task_loss": 1.0624237060546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19574545323848724, + "epoch": 0.78, + "learning_rate": 4.4420963651732885e-05, + "loss": 0.3473, + "step": 924, + "task_loss": 1.3533955812454224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5235202312469482, + "epoch": 0.78, + "learning_rate": 4.441492573360705e-05, + "loss": 0.4736, + "step": 925, + "task_loss": 1.3145147562026978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23668259382247925, + "epoch": 0.78, + "learning_rate": 4.4408887815481226e-05, + "loss": 0.2459, + "step": 926, + "task_loss": 0.5487642884254456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36069899797439575, + "epoch": 0.78, + "learning_rate": 4.440284989735539e-05, + "loss": 0.2124, + "step": 927, + "task_loss": 0.13465382158756256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4356892704963684, + "epoch": 0.78, + "learning_rate": 4.439681197922956e-05, + "loss": 0.4098, + "step": 928, + "task_loss": 0.3588753342628479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5197922587394714, + "epoch": 0.78, + "learning_rate": 4.4390774061103735e-05, + "loss": 0.3846, + "step": 929, + "task_loss": 0.8379908800125122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2317069172859192, + "epoch": 0.79, + "learning_rate": 4.43847361429779e-05, + "loss": 0.3911, + "step": 930, + "task_loss": 1.5306708812713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30233681201934814, + "epoch": 0.79, + "learning_rate": 4.437869822485207e-05, + "loss": 0.3162, + "step": 931, + "task_loss": 0.8812251687049866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16705961525440216, + "epoch": 0.79, + "learning_rate": 4.437266030672624e-05, + "loss": 0.2586, + "step": 932, + "task_loss": 0.20003189146518707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4740631878376007, + "epoch": 0.79, + "learning_rate": 4.436662238860042e-05, + "loss": 0.3369, + "step": 933, + "task_loss": 1.3663749694824219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.375772088766098, + "epoch": 0.79, + "learning_rate": 4.4360584470474584e-05, + "loss": 0.369, + "step": 934, + "task_loss": 0.583289384841919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4133703410625458, + "epoch": 0.79, + "learning_rate": 4.435454655234875e-05, + "loss": 0.3368, + "step": 935, + "task_loss": 0.3173907697200775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22911988198757172, + "epoch": 0.79, + "learning_rate": 4.4348508634222925e-05, + "loss": 0.3515, + "step": 936, + "task_loss": 0.13677309453487396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.196448415517807, + "epoch": 0.79, + "learning_rate": 4.434247071609709e-05, + "loss": 0.4182, + "step": 937, + "task_loss": 0.3450338840484619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41565096378326416, + "epoch": 0.79, + "learning_rate": 4.433643279797126e-05, + "loss": 0.403, + "step": 938, + "task_loss": 0.7415960431098938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.241883784532547, + "epoch": 0.79, + "learning_rate": 4.4330394879845433e-05, + "loss": 0.385, + "step": 939, + "task_loss": 0.30381277203559875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3282340168952942, + "epoch": 0.79, + "learning_rate": 4.43243569617196e-05, + "loss": 0.3275, + "step": 940, + "task_loss": 0.9947277903556824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3644707500934601, + "epoch": 0.79, + "learning_rate": 4.431831904359377e-05, + "loss": 0.2945, + "step": 941, + "task_loss": 0.7145079970359802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3223477303981781, + "epoch": 0.8, + "learning_rate": 4.431228112546794e-05, + "loss": 0.2993, + "step": 942, + "task_loss": 0.41398316621780396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3000563979148865, + "epoch": 0.8, + "learning_rate": 4.4306243207342116e-05, + "loss": 0.2881, + "step": 943, + "task_loss": 0.22363801300525665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3299402892589569, + "epoch": 0.8, + "learning_rate": 4.4300205289216276e-05, + "loss": 0.3253, + "step": 944, + "task_loss": 0.5077597498893738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2596122622489929, + "epoch": 0.8, + "learning_rate": 4.429416737109045e-05, + "loss": 0.3087, + "step": 945, + "task_loss": 0.423097163438797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25506094098091125, + "epoch": 0.8, + "learning_rate": 4.4288129452964624e-05, + "loss": 0.2447, + "step": 946, + "task_loss": 0.4598116874694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27282679080963135, + "epoch": 0.8, + "learning_rate": 4.4282091534838784e-05, + "loss": 0.267, + "step": 947, + "task_loss": 0.40855786204338074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2465524524450302, + "epoch": 0.8, + "learning_rate": 4.427605361671296e-05, + "loss": 0.3157, + "step": 948, + "task_loss": 1.4026042222976685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2175057828426361, + "epoch": 0.8, + "learning_rate": 4.427001569858713e-05, + "loss": 0.3274, + "step": 949, + "task_loss": 0.42884060740470886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15002313256263733, + "epoch": 0.8, + "learning_rate": 4.42639777804613e-05, + "loss": 0.2565, + "step": 950, + "task_loss": 0.698137640953064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24660664796829224, + "epoch": 0.8, + "learning_rate": 4.425793986233547e-05, + "loss": 0.292, + "step": 951, + "task_loss": 0.5662290453910828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3446224331855774, + "epoch": 0.8, + "learning_rate": 4.425190194420964e-05, + "loss": 0.3067, + "step": 952, + "task_loss": 0.48581963777542114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22411227226257324, + "epoch": 0.81, + "learning_rate": 4.424586402608381e-05, + "loss": 0.3351, + "step": 953, + "task_loss": 0.3839857578277588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3203333020210266, + "epoch": 0.81, + "learning_rate": 4.4239826107957975e-05, + "loss": 0.3357, + "step": 954, + "task_loss": 0.9429383277893066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2740730345249176, + "epoch": 0.81, + "learning_rate": 4.423378818983215e-05, + "loss": 0.3464, + "step": 955, + "task_loss": 0.3803875744342804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18013200163841248, + "epoch": 0.81, + "learning_rate": 4.422775027170632e-05, + "loss": 0.2771, + "step": 956, + "task_loss": 0.037426676601171494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2157067507505417, + "epoch": 0.81, + "learning_rate": 4.4221712353580483e-05, + "loss": 0.2912, + "step": 957, + "task_loss": 0.7635881900787354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38518476486206055, + "epoch": 0.81, + "learning_rate": 4.421567443545466e-05, + "loss": 0.3426, + "step": 958, + "task_loss": 0.7743237018585205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25463634729385376, + "epoch": 0.81, + "learning_rate": 4.420963651732883e-05, + "loss": 0.4342, + "step": 959, + "task_loss": 0.4945194721221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36092615127563477, + "epoch": 0.81, + "learning_rate": 4.420359859920299e-05, + "loss": 0.3765, + "step": 960, + "task_loss": 0.5233352184295654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2952161431312561, + "epoch": 0.81, + "learning_rate": 4.4197560681077166e-05, + "loss": 0.3502, + "step": 961, + "task_loss": 0.05033006891608238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3725583553314209, + "epoch": 0.81, + "learning_rate": 4.419152276295134e-05, + "loss": 0.3647, + "step": 962, + "task_loss": 0.49692144989967346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2846097946166992, + "epoch": 0.81, + "learning_rate": 4.418548484482551e-05, + "loss": 0.4061, + "step": 963, + "task_loss": 1.0440422296524048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27408260107040405, + "epoch": 0.81, + "learning_rate": 4.4179446926699674e-05, + "loss": 0.3303, + "step": 964, + "task_loss": 0.1104651391506195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39300885796546936, + "epoch": 0.82, + "learning_rate": 4.417340900857385e-05, + "loss": 0.3554, + "step": 965, + "task_loss": 0.3712834417819977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3396458923816681, + "epoch": 0.82, + "learning_rate": 4.4167371090448015e-05, + "loss": 0.3575, + "step": 966, + "task_loss": 1.6036909818649292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2086452692747116, + "epoch": 0.82, + "learning_rate": 4.416133317232218e-05, + "loss": 0.3655, + "step": 967, + "task_loss": 0.4195214807987213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.15584062039852142, + "epoch": 0.82, + "learning_rate": 4.4155295254196356e-05, + "loss": 0.304, + "step": 968, + "task_loss": 0.32137513160705566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3015002906322479, + "epoch": 0.82, + "learning_rate": 4.4149257336070523e-05, + "loss": 0.3427, + "step": 969, + "task_loss": 1.1579301357269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2543022632598877, + "epoch": 0.82, + "learning_rate": 4.414321941794469e-05, + "loss": 0.285, + "step": 970, + "task_loss": 0.47699981927871704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2944380044937134, + "epoch": 0.82, + "learning_rate": 4.4137181499818865e-05, + "loss": 0.358, + "step": 971, + "task_loss": 0.5717900395393372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26408302783966064, + "epoch": 0.82, + "learning_rate": 4.413114358169304e-05, + "loss": 0.2635, + "step": 972, + "task_loss": 0.1646224409341812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4564114212989807, + "epoch": 0.82, + "learning_rate": 4.4125105663567206e-05, + "loss": 0.3039, + "step": 973, + "task_loss": 0.5078140497207642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20683130621910095, + "epoch": 0.82, + "learning_rate": 4.411906774544137e-05, + "loss": 0.316, + "step": 974, + "task_loss": 0.19483999907970428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2430845946073532, + "epoch": 0.82, + "learning_rate": 4.411302982731555e-05, + "loss": 0.3225, + "step": 975, + "task_loss": 0.42443209886550903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21450918912887573, + "epoch": 0.82, + "learning_rate": 4.4106991909189714e-05, + "loss": 0.4208, + "step": 976, + "task_loss": 0.5877259373664856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3073863387107849, + "epoch": 0.83, + "learning_rate": 4.410095399106388e-05, + "loss": 0.3333, + "step": 977, + "task_loss": 1.2696553468704224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21301056444644928, + "epoch": 0.83, + "learning_rate": 4.4094916072938055e-05, + "loss": 0.3137, + "step": 978, + "task_loss": 0.5221570730209351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3419423997402191, + "epoch": 0.83, + "learning_rate": 4.408887815481222e-05, + "loss": 0.3735, + "step": 979, + "task_loss": 0.10681381821632385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2353341281414032, + "epoch": 0.83, + "learning_rate": 4.408284023668639e-05, + "loss": 0.2944, + "step": 980, + "task_loss": 1.1870335340499878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2248719334602356, + "epoch": 0.83, + "learning_rate": 4.4076802318560564e-05, + "loss": 0.3282, + "step": 981, + "task_loss": 1.460014820098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45160430669784546, + "epoch": 0.83, + "learning_rate": 4.407076440043473e-05, + "loss": 0.3221, + "step": 982, + "task_loss": 0.32142579555511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34242257475852966, + "epoch": 0.83, + "learning_rate": 4.4064726482308905e-05, + "loss": 0.2983, + "step": 983, + "task_loss": 0.5037776827812195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1728317141532898, + "epoch": 0.83, + "learning_rate": 4.405868856418307e-05, + "loss": 0.4662, + "step": 984, + "task_loss": 0.9488698244094849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2451724410057068, + "epoch": 0.83, + "learning_rate": 4.405265064605724e-05, + "loss": 0.2674, + "step": 985, + "task_loss": 1.4671635627746582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3261406719684601, + "epoch": 0.83, + "learning_rate": 4.404661272793141e-05, + "loss": 0.317, + "step": 986, + "task_loss": 1.112964153289795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3129112124443054, + "epoch": 0.83, + "learning_rate": 4.404057480980558e-05, + "loss": 0.3609, + "step": 987, + "task_loss": 0.9495210647583008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2972010374069214, + "epoch": 0.83, + "learning_rate": 4.403453689167975e-05, + "loss": 0.3262, + "step": 988, + "task_loss": 0.5362968444824219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4374579191207886, + "epoch": 0.84, + "learning_rate": 4.402849897355392e-05, + "loss": 0.3438, + "step": 989, + "task_loss": 1.0089572668075562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33846473693847656, + "epoch": 0.84, + "learning_rate": 4.402246105542809e-05, + "loss": 0.2915, + "step": 990, + "task_loss": 1.0764498710632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.220596581697464, + "epoch": 0.84, + "learning_rate": 4.401642313730226e-05, + "loss": 0.3072, + "step": 991, + "task_loss": 0.6565892696380615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32423368096351624, + "epoch": 0.84, + "learning_rate": 4.401038521917643e-05, + "loss": 0.2883, + "step": 992, + "task_loss": 1.0279080867767334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28588879108428955, + "epoch": 0.84, + "learning_rate": 4.4004347301050604e-05, + "loss": 0.2894, + "step": 993, + "task_loss": 0.20756539702415466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3498670160770416, + "epoch": 0.84, + "learning_rate": 4.399830938292477e-05, + "loss": 0.3086, + "step": 994, + "task_loss": 0.9307231903076172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38636162877082825, + "epoch": 0.84, + "learning_rate": 4.399227146479894e-05, + "loss": 0.3729, + "step": 995, + "task_loss": 0.3652908205986023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2958947420120239, + "epoch": 0.84, + "learning_rate": 4.398623354667311e-05, + "loss": 0.2632, + "step": 996, + "task_loss": 0.21556514501571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2821813225746155, + "epoch": 0.84, + "learning_rate": 4.398019562854728e-05, + "loss": 0.3607, + "step": 997, + "task_loss": 0.9080299139022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27574506402015686, + "epoch": 0.84, + "learning_rate": 4.3974157710421446e-05, + "loss": 0.349, + "step": 998, + "task_loss": 1.4849597215652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32307714223861694, + "epoch": 0.84, + "learning_rate": 4.396811979229562e-05, + "loss": 0.3933, + "step": 999, + "task_loss": 1.157652497291565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4837101697921753, + "epoch": 0.84, + "learning_rate": 4.396208187416979e-05, + "loss": 0.3183, + "step": 1000, + "task_loss": 0.6930521130561829 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.9144158415841585, + "eval_loss": 0.1866626739501953, + "eval_runtime": 338.8647, + "eval_samples_per_second": 74.514, + "eval_steps_per_second": 0.584, + "step": 1000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20211786031723022, + "epoch": 0.85, + "learning_rate": 4.3956043956043955e-05, + "loss": 0.2746, + "step": 1001, + "task_loss": 0.37245434522628784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14325755834579468, + "epoch": 0.85, + "learning_rate": 4.395000603791813e-05, + "loss": 0.2348, + "step": 1002, + "task_loss": 0.4337844252586365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1996465027332306, + "epoch": 0.85, + "learning_rate": 4.39439681197923e-05, + "loss": 0.3155, + "step": 1003, + "task_loss": 0.15168380737304688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29522231221199036, + "epoch": 0.85, + "learning_rate": 4.393793020166646e-05, + "loss": 0.4092, + "step": 1004, + "task_loss": 0.5470403432846069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9443536996841431, + "epoch": 0.85, + "learning_rate": 4.393189228354064e-05, + "loss": 0.4567, + "step": 1005, + "task_loss": 1.8830361366271973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49368518590927124, + "epoch": 0.85, + "learning_rate": 4.392585436541481e-05, + "loss": 0.3289, + "step": 1006, + "task_loss": 0.09269459545612335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20181068778038025, + "epoch": 0.85, + "learning_rate": 4.391981644728898e-05, + "loss": 0.2896, + "step": 1007, + "task_loss": 0.27710384130477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3943747580051422, + "epoch": 0.85, + "learning_rate": 4.3913778529163145e-05, + "loss": 0.3982, + "step": 1008, + "task_loss": 0.7358403205871582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38803327083587646, + "epoch": 0.85, + "learning_rate": 4.390774061103732e-05, + "loss": 0.305, + "step": 1009, + "task_loss": 1.1953283548355103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3378750681877136, + "epoch": 0.85, + "learning_rate": 4.3901702692911486e-05, + "loss": 0.4178, + "step": 1010, + "task_loss": 1.432046890258789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5088300704956055, + "epoch": 0.85, + "learning_rate": 4.3895664774785654e-05, + "loss": 0.3755, + "step": 1011, + "task_loss": 0.8369073271751404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3163774013519287, + "epoch": 0.85, + "learning_rate": 4.388962685665983e-05, + "loss": 0.3321, + "step": 1012, + "task_loss": 0.30643585324287415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11590738594532013, + "epoch": 0.86, + "learning_rate": 4.3883588938534e-05, + "loss": 0.2348, + "step": 1013, + "task_loss": 0.04274863377213478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21037417650222778, + "epoch": 0.86, + "learning_rate": 4.387755102040816e-05, + "loss": 0.3084, + "step": 1014, + "task_loss": 0.15180319547653198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1809810847043991, + "epoch": 0.86, + "learning_rate": 4.3871513102282336e-05, + "loss": 0.2583, + "step": 1015, + "task_loss": 0.919890820980072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30239373445510864, + "epoch": 0.86, + "learning_rate": 4.386547518415651e-05, + "loss": 0.3781, + "step": 1016, + "task_loss": 0.5444217920303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.16519245505332947, + "epoch": 0.86, + "learning_rate": 4.385943726603067e-05, + "loss": 0.3502, + "step": 1017, + "task_loss": 0.12372271716594696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21786856651306152, + "epoch": 0.86, + "learning_rate": 4.3853399347904844e-05, + "loss": 0.2769, + "step": 1018, + "task_loss": 0.7423974275588989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3422611653804779, + "epoch": 0.86, + "learning_rate": 4.384736142977902e-05, + "loss": 0.4249, + "step": 1019, + "task_loss": 0.4619762897491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2182973176240921, + "epoch": 0.86, + "learning_rate": 4.384132351165318e-05, + "loss": 0.2564, + "step": 1020, + "task_loss": 0.7951963543891907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.151831716299057, + "epoch": 0.86, + "learning_rate": 4.383528559352735e-05, + "loss": 0.4157, + "step": 1021, + "task_loss": 0.04336428642272949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1442113220691681, + "epoch": 0.86, + "learning_rate": 4.3829247675401526e-05, + "loss": 0.3081, + "step": 1022, + "task_loss": 0.010821145959198475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3111485540866852, + "epoch": 0.86, + "learning_rate": 4.3823209757275694e-05, + "loss": 0.2749, + "step": 1023, + "task_loss": 0.8378331065177917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18909099698066711, + "epoch": 0.87, + "learning_rate": 4.381717183914986e-05, + "loss": 0.2794, + "step": 1024, + "task_loss": 0.13452747464179993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2863258719444275, + "epoch": 0.87, + "learning_rate": 4.3811133921024035e-05, + "loss": 0.3628, + "step": 1025, + "task_loss": 0.5825551152229309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2623407542705536, + "epoch": 0.87, + "learning_rate": 4.38050960028982e-05, + "loss": 0.2995, + "step": 1026, + "task_loss": 0.07437510788440704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2909240424633026, + "epoch": 0.87, + "learning_rate": 4.379905808477237e-05, + "loss": 0.2935, + "step": 1027, + "task_loss": 0.3604443073272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32045578956604004, + "epoch": 0.87, + "learning_rate": 4.379302016664654e-05, + "loss": 0.3636, + "step": 1028, + "task_loss": 0.5377231240272522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1561213582754135, + "epoch": 0.87, + "learning_rate": 4.378698224852072e-05, + "loss": 0.3283, + "step": 1029, + "task_loss": 0.5916603803634644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49331194162368774, + "epoch": 0.87, + "learning_rate": 4.378094433039488e-05, + "loss": 0.3582, + "step": 1030, + "task_loss": 0.8987064361572266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3306241035461426, + "epoch": 0.87, + "learning_rate": 4.377490641226905e-05, + "loss": 0.354, + "step": 1031, + "task_loss": 0.23180772364139557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3254731595516205, + "epoch": 0.87, + "learning_rate": 4.3768868494143225e-05, + "loss": 0.2792, + "step": 1032, + "task_loss": 0.1383245885372162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1822787970304489, + "epoch": 0.87, + "learning_rate": 4.376283057601739e-05, + "loss": 0.4297, + "step": 1033, + "task_loss": 0.09531703591346741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2677638530731201, + "epoch": 0.87, + "learning_rate": 4.375679265789156e-05, + "loss": 0.3075, + "step": 1034, + "task_loss": 0.6562163829803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20685975253582, + "epoch": 0.87, + "learning_rate": 4.3750754739765734e-05, + "loss": 0.2237, + "step": 1035, + "task_loss": 0.6161696314811707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4285741150379181, + "epoch": 0.88, + "learning_rate": 4.37447168216399e-05, + "loss": 0.3862, + "step": 1036, + "task_loss": 0.4181616008281708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28285080194473267, + "epoch": 0.88, + "learning_rate": 4.373867890351407e-05, + "loss": 0.471, + "step": 1037, + "task_loss": 1.4052841663360596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24739590287208557, + "epoch": 0.88, + "learning_rate": 4.373264098538824e-05, + "loss": 0.3541, + "step": 1038, + "task_loss": 0.23796343803405762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40053582191467285, + "epoch": 0.88, + "learning_rate": 4.372660306726241e-05, + "loss": 0.3829, + "step": 1039, + "task_loss": 0.7183547019958496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1861676275730133, + "epoch": 0.88, + "learning_rate": 4.3720565149136576e-05, + "loss": 0.3873, + "step": 1040, + "task_loss": 0.8843746781349182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25143784284591675, + "epoch": 0.88, + "learning_rate": 4.371452723101075e-05, + "loss": 0.4188, + "step": 1041, + "task_loss": 0.32889324426651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21560464799404144, + "epoch": 0.88, + "learning_rate": 4.370848931288492e-05, + "loss": 0.2681, + "step": 1042, + "task_loss": 0.7099791765213013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23275157809257507, + "epoch": 0.88, + "learning_rate": 4.370245139475909e-05, + "loss": 0.2865, + "step": 1043, + "task_loss": 0.36161231994628906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3409966826438904, + "epoch": 0.88, + "learning_rate": 4.369641347663326e-05, + "loss": 0.2789, + "step": 1044, + "task_loss": 0.6682897210121155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1551985889673233, + "epoch": 0.88, + "learning_rate": 4.3690375558507426e-05, + "loss": 0.2918, + "step": 1045, + "task_loss": 0.6977567076683044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4235483705997467, + "epoch": 0.88, + "learning_rate": 4.36843376403816e-05, + "loss": 0.3734, + "step": 1046, + "task_loss": 0.9238823652267456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45389509201049805, + "epoch": 0.88, + "learning_rate": 4.367829972225577e-05, + "loss": 0.39, + "step": 1047, + "task_loss": 0.15337498486042023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23572328686714172, + "epoch": 0.89, + "learning_rate": 4.367226180412994e-05, + "loss": 0.3884, + "step": 1048, + "task_loss": 0.4750136137008667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20941177010536194, + "epoch": 0.89, + "learning_rate": 4.366622388600411e-05, + "loss": 0.3282, + "step": 1049, + "task_loss": 0.59443199634552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3287932276725769, + "epoch": 0.89, + "learning_rate": 4.3660185967878275e-05, + "loss": 0.3408, + "step": 1050, + "task_loss": 0.5581884980201721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23965969681739807, + "epoch": 0.89, + "learning_rate": 4.365414804975245e-05, + "loss": 0.3065, + "step": 1051, + "task_loss": 0.49103525280952454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23221686482429504, + "epoch": 0.89, + "learning_rate": 4.3648110131626617e-05, + "loss": 0.2656, + "step": 1052, + "task_loss": 0.1033593937754631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5119155049324036, + "epoch": 0.89, + "learning_rate": 4.364207221350079e-05, + "loss": 0.4183, + "step": 1053, + "task_loss": 0.9471501111984253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49145814776420593, + "epoch": 0.89, + "learning_rate": 4.363603429537496e-05, + "loss": 0.3626, + "step": 1054, + "task_loss": 0.9312189221382141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.10522010922431946, + "epoch": 0.89, + "learning_rate": 4.3629996377249125e-05, + "loss": 0.2465, + "step": 1055, + "task_loss": 0.09354358911514282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41132575273513794, + "epoch": 0.89, + "learning_rate": 4.36239584591233e-05, + "loss": 0.3323, + "step": 1056, + "task_loss": 0.7960087656974792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7278478145599365, + "epoch": 0.89, + "learning_rate": 4.3617920540997466e-05, + "loss": 0.3632, + "step": 1057, + "task_loss": 0.714910089969635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2962322533130646, + "epoch": 0.89, + "learning_rate": 4.361188262287163e-05, + "loss": 0.3442, + "step": 1058, + "task_loss": 1.0103408098220825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17133769392967224, + "epoch": 0.89, + "learning_rate": 4.360584470474581e-05, + "loss": 0.3008, + "step": 1059, + "task_loss": 0.18613861501216888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3180844485759735, + "epoch": 0.9, + "learning_rate": 4.3599806786619974e-05, + "loss": 0.4371, + "step": 1060, + "task_loss": 1.3728989362716675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2493615746498108, + "epoch": 0.9, + "learning_rate": 4.359376886849414e-05, + "loss": 0.3169, + "step": 1061, + "task_loss": 0.8047768473625183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32007667422294617, + "epoch": 0.9, + "learning_rate": 4.3587730950368315e-05, + "loss": 0.343, + "step": 1062, + "task_loss": 0.8326956629753113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26670730113983154, + "epoch": 0.9, + "learning_rate": 4.358169303224248e-05, + "loss": 0.2703, + "step": 1063, + "task_loss": 0.3882765471935272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5862593054771423, + "epoch": 0.9, + "learning_rate": 4.3575655114116657e-05, + "loss": 0.4138, + "step": 1064, + "task_loss": 1.2722101211547852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46131211519241333, + "epoch": 0.9, + "learning_rate": 4.3569617195990824e-05, + "loss": 0.4005, + "step": 1065, + "task_loss": 0.7884384989738464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6409000158309937, + "epoch": 0.9, + "learning_rate": 4.3563579277865e-05, + "loss": 0.4768, + "step": 1066, + "task_loss": 0.3153364956378937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3720138370990753, + "epoch": 0.9, + "learning_rate": 4.3557541359739165e-05, + "loss": 0.3028, + "step": 1067, + "task_loss": 0.44241032004356384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40596243739128113, + "epoch": 0.9, + "learning_rate": 4.355150344161333e-05, + "loss": 0.3424, + "step": 1068, + "task_loss": 0.8774933815002441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.281246542930603, + "epoch": 0.9, + "learning_rate": 4.3545465523487506e-05, + "loss": 0.2854, + "step": 1069, + "task_loss": 0.5583416223526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22644507884979248, + "epoch": 0.9, + "learning_rate": 4.353942760536167e-05, + "loss": 0.3532, + "step": 1070, + "task_loss": 0.7667713165283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27582958340644836, + "epoch": 0.9, + "learning_rate": 4.353338968723584e-05, + "loss": 0.2971, + "step": 1071, + "task_loss": 0.23659303784370422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7315882444381714, + "epoch": 0.91, + "learning_rate": 4.3527351769110014e-05, + "loss": 0.452, + "step": 1072, + "task_loss": 1.600015640258789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30030736327171326, + "epoch": 0.91, + "learning_rate": 4.352131385098418e-05, + "loss": 0.2688, + "step": 1073, + "task_loss": 0.24636352062225342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6928387880325317, + "epoch": 0.91, + "learning_rate": 4.351527593285835e-05, + "loss": 0.3641, + "step": 1074, + "task_loss": 0.560386598110199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2884763479232788, + "epoch": 0.91, + "learning_rate": 4.350923801473252e-05, + "loss": 0.2604, + "step": 1075, + "task_loss": 0.44039300084114075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2866283059120178, + "epoch": 0.91, + "learning_rate": 4.35032000966067e-05, + "loss": 0.4177, + "step": 1076, + "task_loss": 0.3292704224586487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.26677486300468445, + "epoch": 0.91, + "learning_rate": 4.349716217848086e-05, + "loss": 0.3373, + "step": 1077, + "task_loss": 1.0251530408859253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17510847747325897, + "epoch": 0.91, + "learning_rate": 4.349112426035503e-05, + "loss": 0.2605, + "step": 1078, + "task_loss": 0.4759744107723236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25535091757774353, + "epoch": 0.91, + "learning_rate": 4.3485086342229205e-05, + "loss": 0.2865, + "step": 1079, + "task_loss": 0.4177473187446594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3829230070114136, + "epoch": 0.91, + "learning_rate": 4.347904842410337e-05, + "loss": 0.3933, + "step": 1080, + "task_loss": 0.397390753030777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4474875330924988, + "epoch": 0.91, + "learning_rate": 4.347301050597754e-05, + "loss": 0.4057, + "step": 1081, + "task_loss": 1.24114191532135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30222591757774353, + "epoch": 0.91, + "learning_rate": 4.346697258785171e-05, + "loss": 0.3347, + "step": 1082, + "task_loss": 1.0539515018463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2338535189628601, + "epoch": 0.91, + "learning_rate": 4.346093466972588e-05, + "loss": 0.2787, + "step": 1083, + "task_loss": 0.3205210268497467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2954961657524109, + "epoch": 0.92, + "learning_rate": 4.345489675160005e-05, + "loss": 0.4183, + "step": 1084, + "task_loss": 0.3001650869846344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23438599705696106, + "epoch": 0.92, + "learning_rate": 4.344885883347422e-05, + "loss": 0.3048, + "step": 1085, + "task_loss": 0.8028302192687988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2786942422389984, + "epoch": 0.92, + "learning_rate": 4.3442820915348396e-05, + "loss": 0.3152, + "step": 1086, + "task_loss": 0.5790148973464966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3665401041507721, + "epoch": 0.92, + "learning_rate": 4.3436782997222556e-05, + "loss": 0.3724, + "step": 1087, + "task_loss": 0.3731836974620819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19855603575706482, + "epoch": 0.92, + "learning_rate": 4.343074507909673e-05, + "loss": 0.3621, + "step": 1088, + "task_loss": 0.23326639831066132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3520207405090332, + "epoch": 0.92, + "learning_rate": 4.3424707160970904e-05, + "loss": 0.3503, + "step": 1089, + "task_loss": 0.35438090562820435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28581809997558594, + "epoch": 0.92, + "learning_rate": 4.3418669242845064e-05, + "loss": 0.2611, + "step": 1090, + "task_loss": 0.6348535418510437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22808696329593658, + "epoch": 0.92, + "learning_rate": 4.341263132471924e-05, + "loss": 0.3776, + "step": 1091, + "task_loss": 1.3461005687713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20256385207176208, + "epoch": 0.92, + "learning_rate": 4.340659340659341e-05, + "loss": 0.3096, + "step": 1092, + "task_loss": 0.5651939511299133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45031237602233887, + "epoch": 0.92, + "learning_rate": 4.340055548846757e-05, + "loss": 0.3785, + "step": 1093, + "task_loss": 0.8704009056091309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6844884157180786, + "epoch": 0.92, + "learning_rate": 4.339451757034175e-05, + "loss": 0.4401, + "step": 1094, + "task_loss": 0.6413459181785583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2970822751522064, + "epoch": 0.93, + "learning_rate": 4.338847965221592e-05, + "loss": 0.2697, + "step": 1095, + "task_loss": 0.40856292843818665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2479051649570465, + "epoch": 0.93, + "learning_rate": 4.338244173409009e-05, + "loss": 0.2841, + "step": 1096, + "task_loss": 0.19519725441932678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39759886264801025, + "epoch": 0.93, + "learning_rate": 4.3376403815964255e-05, + "loss": 0.39, + "step": 1097, + "task_loss": 0.7304579019546509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49157416820526123, + "epoch": 0.93, + "learning_rate": 4.337036589783843e-05, + "loss": 0.391, + "step": 1098, + "task_loss": 0.9423661828041077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2870109975337982, + "epoch": 0.93, + "learning_rate": 4.3364327979712596e-05, + "loss": 0.2564, + "step": 1099, + "task_loss": 0.8456416130065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.29677945375442505, + "epoch": 0.93, + "learning_rate": 4.335829006158676e-05, + "loss": 0.2936, + "step": 1100, + "task_loss": 0.43353864550590515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14345046877861023, + "epoch": 0.93, + "learning_rate": 4.335225214346094e-05, + "loss": 0.3758, + "step": 1101, + "task_loss": 0.25398945808410645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1733161211013794, + "epoch": 0.93, + "learning_rate": 4.3346214225335104e-05, + "loss": 0.2603, + "step": 1102, + "task_loss": 0.4628955125808716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20866712927818298, + "epoch": 0.93, + "learning_rate": 4.334017630720927e-05, + "loss": 0.2143, + "step": 1103, + "task_loss": 0.03302335366606712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3210042715072632, + "epoch": 0.93, + "learning_rate": 4.3334138389083446e-05, + "loss": 0.2666, + "step": 1104, + "task_loss": 0.7447173595428467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2237682193517685, + "epoch": 0.93, + "learning_rate": 4.332810047095762e-05, + "loss": 0.3293, + "step": 1105, + "task_loss": 0.8289480805397034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.362856388092041, + "epoch": 0.93, + "learning_rate": 4.332206255283179e-05, + "loss": 0.3326, + "step": 1106, + "task_loss": 0.6049726605415344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23949892818927765, + "epoch": 0.94, + "learning_rate": 4.3316024634705954e-05, + "loss": 0.3116, + "step": 1107, + "task_loss": 0.23631735146045685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4181007146835327, + "epoch": 0.94, + "learning_rate": 4.330998671658013e-05, + "loss": 0.3144, + "step": 1108, + "task_loss": 0.3326930105686188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2478361427783966, + "epoch": 0.94, + "learning_rate": 4.3303948798454295e-05, + "loss": 0.3781, + "step": 1109, + "task_loss": 0.25559258460998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3049302399158478, + "epoch": 0.94, + "learning_rate": 4.329791088032846e-05, + "loss": 0.2925, + "step": 1110, + "task_loss": 0.4718712866306305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2884487509727478, + "epoch": 0.94, + "learning_rate": 4.3291872962202636e-05, + "loss": 0.3083, + "step": 1111, + "task_loss": 0.44975167512893677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33776795864105225, + "epoch": 0.94, + "learning_rate": 4.32858350440768e-05, + "loss": 0.3427, + "step": 1112, + "task_loss": 0.761394739151001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24266141653060913, + "epoch": 0.94, + "learning_rate": 4.327979712595097e-05, + "loss": 0.4165, + "step": 1113, + "task_loss": 0.5390146970748901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39270442724227905, + "epoch": 0.94, + "learning_rate": 4.3273759207825144e-05, + "loss": 0.3647, + "step": 1114, + "task_loss": 0.3376269042491913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25299280881881714, + "epoch": 0.94, + "learning_rate": 4.326772128969931e-05, + "loss": 0.3573, + "step": 1115, + "task_loss": 0.07655156403779984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21404612064361572, + "epoch": 0.94, + "learning_rate": 4.3261683371573486e-05, + "loss": 0.2581, + "step": 1116, + "task_loss": 0.619338870048523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2959321439266205, + "epoch": 0.94, + "learning_rate": 4.325564545344765e-05, + "loss": 0.3032, + "step": 1117, + "task_loss": 0.44233718514442444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33615487813949585, + "epoch": 0.94, + "learning_rate": 4.324960753532182e-05, + "loss": 0.3433, + "step": 1118, + "task_loss": 0.2285844087600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17386695742607117, + "epoch": 0.95, + "learning_rate": 4.3243569617195994e-05, + "loss": 0.2761, + "step": 1119, + "task_loss": 0.3223957121372223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14410927891731262, + "epoch": 0.95, + "learning_rate": 4.323753169907016e-05, + "loss": 0.2419, + "step": 1120, + "task_loss": 0.1336435228586197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30185455083847046, + "epoch": 0.95, + "learning_rate": 4.3231493780944335e-05, + "loss": 0.2868, + "step": 1121, + "task_loss": 1.0119432210922241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24792376160621643, + "epoch": 0.95, + "learning_rate": 4.32254558628185e-05, + "loss": 0.2882, + "step": 1122, + "task_loss": 0.2847708463668823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3827042579650879, + "epoch": 0.95, + "learning_rate": 4.321941794469267e-05, + "loss": 0.3252, + "step": 1123, + "task_loss": 0.3445693850517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2029864639043808, + "epoch": 0.95, + "learning_rate": 4.3213380026566843e-05, + "loss": 0.2912, + "step": 1124, + "task_loss": 0.9144219160079956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4248496890068054, + "epoch": 0.95, + "learning_rate": 4.320734210844101e-05, + "loss": 0.3746, + "step": 1125, + "task_loss": 0.42825064063072205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2957007884979248, + "epoch": 0.95, + "learning_rate": 4.3201304190315185e-05, + "loss": 0.3867, + "step": 1126, + "task_loss": 0.8085893988609314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2927684187889099, + "epoch": 0.95, + "learning_rate": 4.319526627218935e-05, + "loss": 0.3136, + "step": 1127, + "task_loss": 1.5016776323318481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20850490033626556, + "epoch": 0.95, + "learning_rate": 4.318922835406352e-05, + "loss": 0.3148, + "step": 1128, + "task_loss": 1.0765442848205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3674388527870178, + "epoch": 0.95, + "learning_rate": 4.318319043593769e-05, + "loss": 0.3104, + "step": 1129, + "task_loss": 0.9712739586830139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32994064688682556, + "epoch": 0.95, + "learning_rate": 4.317715251781186e-05, + "loss": 0.3463, + "step": 1130, + "task_loss": 0.9124463796615601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20151212811470032, + "epoch": 0.96, + "learning_rate": 4.317111459968603e-05, + "loss": 0.2997, + "step": 1131, + "task_loss": 0.07413379848003387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34024250507354736, + "epoch": 0.96, + "learning_rate": 4.31650766815602e-05, + "loss": 0.41, + "step": 1132, + "task_loss": 0.10200577229261398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2040729820728302, + "epoch": 0.96, + "learning_rate": 4.315903876343437e-05, + "loss": 0.3713, + "step": 1133, + "task_loss": 0.6439746618270874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24761268496513367, + "epoch": 0.96, + "learning_rate": 4.3153000845308536e-05, + "loss": 0.3052, + "step": 1134, + "task_loss": 0.2621586322784424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25512057542800903, + "epoch": 0.96, + "learning_rate": 4.314696292718271e-05, + "loss": 0.4092, + "step": 1135, + "task_loss": 1.3069474697113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.28453660011291504, + "epoch": 0.96, + "learning_rate": 4.3140925009056883e-05, + "loss": 0.3304, + "step": 1136, + "task_loss": 0.7997164130210876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34711629152297974, + "epoch": 0.96, + "learning_rate": 4.313488709093105e-05, + "loss": 0.3129, + "step": 1137, + "task_loss": 0.8514912724494934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2861866354942322, + "epoch": 0.96, + "learning_rate": 4.312884917280522e-05, + "loss": 0.2589, + "step": 1138, + "task_loss": 0.4697415828704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19450606405735016, + "epoch": 0.96, + "learning_rate": 4.312281125467939e-05, + "loss": 0.2788, + "step": 1139, + "task_loss": 0.9088435173034668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45894676446914673, + "epoch": 0.96, + "learning_rate": 4.311677333655356e-05, + "loss": 0.3069, + "step": 1140, + "task_loss": 0.2890830338001251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.527074933052063, + "epoch": 0.96, + "learning_rate": 4.3110735418427726e-05, + "loss": 0.3773, + "step": 1141, + "task_loss": 0.8116379976272583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5287387371063232, + "epoch": 0.96, + "learning_rate": 4.31046975003019e-05, + "loss": 0.4034, + "step": 1142, + "task_loss": 0.8627524375915527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1560383439064026, + "epoch": 0.97, + "learning_rate": 4.309865958217607e-05, + "loss": 0.2749, + "step": 1143, + "task_loss": 0.3474748432636261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20934341847896576, + "epoch": 0.97, + "learning_rate": 4.3092621664050235e-05, + "loss": 0.2869, + "step": 1144, + "task_loss": 0.28019991517066956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25519466400146484, + "epoch": 0.97, + "learning_rate": 4.308658374592441e-05, + "loss": 0.2733, + "step": 1145, + "task_loss": 0.2840352952480316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24379102885723114, + "epoch": 0.97, + "learning_rate": 4.308054582779858e-05, + "loss": 0.3583, + "step": 1146, + "task_loss": 1.8388844728469849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2644484043121338, + "epoch": 0.97, + "learning_rate": 4.307450790967274e-05, + "loss": 0.2502, + "step": 1147, + "task_loss": 0.24468854069709778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2651967406272888, + "epoch": 0.97, + "learning_rate": 4.306846999154692e-05, + "loss": 0.3498, + "step": 1148, + "task_loss": 0.6555808186531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.290274053812027, + "epoch": 0.97, + "learning_rate": 4.306243207342109e-05, + "loss": 0.2842, + "step": 1149, + "task_loss": 0.9025721549987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5018739700317383, + "epoch": 0.97, + "learning_rate": 4.305639415529525e-05, + "loss": 0.3498, + "step": 1150, + "task_loss": 0.9152520895004272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3573232889175415, + "epoch": 0.97, + "learning_rate": 4.3050356237169425e-05, + "loss": 0.2871, + "step": 1151, + "task_loss": 0.8320004940032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3362210988998413, + "epoch": 0.97, + "learning_rate": 4.30443183190436e-05, + "loss": 0.2646, + "step": 1152, + "task_loss": 0.3134423792362213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2594633400440216, + "epoch": 0.97, + "learning_rate": 4.3038280400917766e-05, + "loss": 0.3415, + "step": 1153, + "task_loss": 0.2707746922969818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.11032459884881973, + "epoch": 0.97, + "learning_rate": 4.3032242482791933e-05, + "loss": 0.3128, + "step": 1154, + "task_loss": 0.6009519696235657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42776772379875183, + "epoch": 0.98, + "learning_rate": 4.302620456466611e-05, + "loss": 0.3611, + "step": 1155, + "task_loss": 1.070064902305603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21777436137199402, + "epoch": 0.98, + "learning_rate": 4.3020166646540275e-05, + "loss": 0.308, + "step": 1156, + "task_loss": 0.24087083339691162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.14768707752227783, + "epoch": 0.98, + "learning_rate": 4.301412872841444e-05, + "loss": 0.2804, + "step": 1157, + "task_loss": 0.48272258043289185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32201606035232544, + "epoch": 0.98, + "learning_rate": 4.3008090810288616e-05, + "loss": 0.2526, + "step": 1158, + "task_loss": 0.22566260397434235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.1533641219139099, + "epoch": 0.98, + "learning_rate": 4.300205289216279e-05, + "loss": 0.3758, + "step": 1159, + "task_loss": 0.4144149124622345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2917732000350952, + "epoch": 0.98, + "learning_rate": 4.299601497403695e-05, + "loss": 0.2851, + "step": 1160, + "task_loss": 0.8044385313987732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30744171142578125, + "epoch": 0.98, + "learning_rate": 4.2989977055911124e-05, + "loss": 0.3513, + "step": 1161, + "task_loss": 0.4882700443267822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2208593338727951, + "epoch": 0.98, + "learning_rate": 4.29839391377853e-05, + "loss": 0.3355, + "step": 1162, + "task_loss": 0.5675768852233887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4546198844909668, + "epoch": 0.98, + "learning_rate": 4.297790121965946e-05, + "loss": 0.3864, + "step": 1163, + "task_loss": 1.3814033269882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3526228070259094, + "epoch": 0.98, + "learning_rate": 4.297186330153363e-05, + "loss": 0.282, + "step": 1164, + "task_loss": 0.990058422088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.402987539768219, + "epoch": 0.98, + "learning_rate": 4.2965825383407806e-05, + "loss": 0.2678, + "step": 1165, + "task_loss": 0.3445275127887726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4974266290664673, + "epoch": 0.99, + "learning_rate": 4.2959787465281974e-05, + "loss": 0.3386, + "step": 1166, + "task_loss": 0.6241133213043213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2324366271495819, + "epoch": 0.99, + "learning_rate": 4.295374954715614e-05, + "loss": 0.2212, + "step": 1167, + "task_loss": 0.2512488067150116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2108522355556488, + "epoch": 0.99, + "learning_rate": 4.2947711629030315e-05, + "loss": 0.263, + "step": 1168, + "task_loss": 0.9792559146881104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4749847650527954, + "epoch": 0.99, + "learning_rate": 4.294167371090448e-05, + "loss": 0.3664, + "step": 1169, + "task_loss": 0.7645936608314514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22060546278953552, + "epoch": 0.99, + "learning_rate": 4.293563579277865e-05, + "loss": 0.2002, + "step": 1170, + "task_loss": 0.2009570151567459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2716113328933716, + "epoch": 0.99, + "learning_rate": 4.292959787465282e-05, + "loss": 0.2465, + "step": 1171, + "task_loss": 0.10613411664962769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.420468270778656, + "epoch": 0.99, + "learning_rate": 4.292355995652699e-05, + "loss": 0.3057, + "step": 1172, + "task_loss": 0.9572458863258362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18062123656272888, + "epoch": 0.99, + "learning_rate": 4.291752203840116e-05, + "loss": 0.2856, + "step": 1173, + "task_loss": 0.28005337715148926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.20923954248428345, + "epoch": 0.99, + "learning_rate": 4.291148412027533e-05, + "loss": 0.2916, + "step": 1174, + "task_loss": 0.3251589238643646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23297256231307983, + "epoch": 0.99, + "learning_rate": 4.29054462021495e-05, + "loss": 0.276, + "step": 1175, + "task_loss": 1.2257041931152344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7452914118766785, + "epoch": 0.99, + "learning_rate": 4.289940828402367e-05, + "loss": 0.4171, + "step": 1176, + "task_loss": 0.6722822189331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.13628363609313965, + "epoch": 0.99, + "learning_rate": 4.289337036589784e-05, + "loss": 0.2287, + "step": 1177, + "task_loss": 0.3862496614456177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21069841086864471, + "epoch": 1.0, + "learning_rate": 4.2887332447772014e-05, + "loss": 0.2761, + "step": 1178, + "task_loss": 0.5023462176322937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3206387758255005, + "epoch": 1.0, + "learning_rate": 4.288129452964618e-05, + "loss": 0.2972, + "step": 1179, + "task_loss": 1.333728551864624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.17135363817214966, + "epoch": 1.0, + "learning_rate": 4.287525661152035e-05, + "loss": 0.2476, + "step": 1180, + "task_loss": 0.3198869824409485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2221292406320572, + "epoch": 1.0, + "learning_rate": 4.286921869339452e-05, + "loss": 0.2624, + "step": 1181, + "task_loss": 1.1573820114135742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23098769783973694, + "epoch": 1.0, + "learning_rate": 4.286318077526869e-05, + "loss": 0.3488, + "step": 1182, + "task_loss": 0.16446726024150848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.18893688917160034, + "epoch": 1.0, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.3232, + "step": 1183, + "task_loss": 0.3497985005378723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -0.004563917405903339, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010001276362903743, + "compression/movement_sparsity/model_sparsity": 0.0009657701752313146, + "compression_loss": 0.0, + "distillation_loss": 0.15048199892044067, + "epoch": 1.0, + "learning_rate": 4.285110493901703e-05, + "loss": 0.5613, + "step": 1184, + "task_loss": 0.7848635911941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0037974665492690463, + "compression/movement_sparsity/importance_threshold": -0.004552363190114976, + "compression/movement_sparsity/linear_layer_sparsity": 0.001039835114517798, + "compression/movement_sparsity/model_sparsity": 0.0010041135794271354, + "compression_loss": 0.41029655933380127, + "distillation_loss": 0.37748983502388, + "epoch": 1.0, + "learning_rate": 4.28450670208912e-05, + "loss": 0.8251, + "step": 1185, + "task_loss": 0.9553921818733215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007588518460622751, + "compression/movement_sparsity/importance_threshold": -0.004540828491578369, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010566362667167291, + "compression/movement_sparsity/model_sparsity": 0.0010203375603616434, + "compression_loss": 0.8199001550674438, + "distillation_loss": 0.25276631116867065, + "epoch": 1.0, + "learning_rate": 4.283902910276537e-05, + "loss": 1.1295, + "step": 1186, + "task_loss": 0.49604928493499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011373161156410516, + "compression/movement_sparsity/importance_threshold": -0.004529313293795415, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010719111254582697, + "compression/movement_sparsity/model_sparsity": 0.0010350876807144502, + "compression_loss": 1.2288110256195068, + "distillation_loss": 0.3114812970161438, + "epoch": 1.0, + "learning_rate": 4.283299118463954e-05, + "loss": 1.5215, + "step": 1187, + "task_loss": 0.9576715230941772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015151400058979747, + "compression/movement_sparsity/importance_threshold": -0.004517817580268018, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010856477665747838, + "compression/movement_sparsity/model_sparsity": 0.0010483524259497611, + "compression_loss": 1.6370296478271484, + "distillation_loss": 0.5240944027900696, + "epoch": 1.0, + "learning_rate": 4.2826953266513706e-05, + "loss": 2.0533, + "step": 1188, + "task_loss": 0.758603036403656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018923240590679014, + "compression/movement_sparsity/importance_threshold": -0.004506341334498075, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011207525160947646, + "compression/movement_sparsity/model_sparsity": 0.0010822512193288893, + "compression_loss": 2.0445544719696045, + "distillation_loss": 0.26520106196403503, + "epoch": 1.01, + "learning_rate": 4.282091534838788e-05, + "loss": 2.3402, + "step": 1189, + "task_loss": 0.23267896473407745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022688688173855553, + "compression/movement_sparsity/importance_threshold": -0.004494884539987491, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011543428963249906, + "compression/movement_sparsity/model_sparsity": 0.0011146876666621108, + "compression_loss": 2.451387882232666, + "distillation_loss": 0.35634660720825195, + "epoch": 1.01, + "learning_rate": 4.281487743026205e-05, + "loss": 2.8116, + "step": 1190, + "task_loss": 0.2072899341583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026447748230858936, + "compression/movement_sparsity/importance_threshold": -0.004483447180238162, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011436588421232573, + "compression/movement_sparsity/model_sparsity": 0.001104370642590202, + "compression_loss": 2.857529401779175, + "distillation_loss": 0.3354710042476654, + "epoch": 1.01, + "learning_rate": 4.2808839512136214e-05, + "loss": 3.1024, + "step": 1191, + "task_loss": 0.565451979637146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030200426184036733, + "compression/movement_sparsity/importance_threshold": -0.00447202923875199, + "compression/movement_sparsity/linear_layer_sparsity": 0.001161974363611943, + "compression/movement_sparsity/model_sparsity": 0.001122056969570617, + "compression_loss": 3.2629759311676025, + "distillation_loss": 0.33731716871261597, + "epoch": 1.01, + "learning_rate": 4.280280159401039e-05, + "loss": 3.6671, + "step": 1192, + "task_loss": 0.5615651607513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033946727455736514, + "compression/movement_sparsity/importance_threshold": -0.004460630699030878, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011680795374415047, + "compression/movement_sparsity/model_sparsity": 0.0011279524118974218, + "compression_loss": 3.6677308082580566, + "distillation_loss": 0.3487268090248108, + "epoch": 1.01, + "learning_rate": 4.2796763675884555e-05, + "loss": 4.0313, + "step": 1193, + "task_loss": 0.7415367960929871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03768665746830768, + "compression/movement_sparsity/importance_threshold": -0.004449251544576722, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011925121569273881, + "compression/movement_sparsity/model_sparsity": 0.0011515456957404359, + "compression_loss": 4.071794509887695, + "distillation_loss": 0.3797004520893097, + "epoch": 1.01, + "learning_rate": 4.279072575775873e-05, + "loss": 4.4957, + "step": 1194, + "task_loss": 1.3221724033355713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04142022164409781, + "compression/movement_sparsity/importance_threshold": -0.004437891758891425, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012308722042119595, + "compression/movement_sparsity/model_sparsity": 0.0011885879573914737, + "compression_loss": 4.475164413452148, + "distillation_loss": 0.21213014423847198, + "epoch": 1.01, + "learning_rate": 4.2784687839632896e-05, + "loss": 4.8729, + "step": 1195, + "task_loss": 0.08939064294099808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04514742540545447, + "compression/movement_sparsity/importance_threshold": -0.004426551325476888, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012646533711243592, + "compression/movement_sparsity/model_sparsity": 0.0012212086372974077, + "compression_loss": 4.877840995788574, + "distillation_loss": 0.18829788267612457, + "epoch": 1.01, + "learning_rate": 4.277864992150707e-05, + "loss": 5.3977, + "step": 1196, + "task_loss": 0.369351327419281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04886827417472722, + "compression/movement_sparsity/importance_threshold": -0.0044152302278350096, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012770783538009285, + "compression/movement_sparsity/model_sparsity": 0.0012332067835953192, + "compression_loss": 5.279824733734131, + "distillation_loss": 0.25111642479896545, + "epoch": 1.01, + "learning_rate": 4.277261200338124e-05, + "loss": 5.5838, + "step": 1197, + "task_loss": 0.5028445720672607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05258277337426348, + "compression/movement_sparsity/importance_threshold": -0.004403928449467692, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012772691404831025, + "compression/movement_sparsity/model_sparsity": 0.0012333910161680319, + "compression_loss": 5.681119918823242, + "distillation_loss": 0.16469311714172363, + "epoch": 1.01, + "learning_rate": 4.2766574085255405e-05, + "loss": 5.9799, + "step": 1198, + "task_loss": 0.6138965487480164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05629092842641148, + "compression/movement_sparsity/importance_threshold": -0.004392645973876834, + "compression/movement_sparsity/linear_layer_sparsity": 0.001306638365370233, + "compression/movement_sparsity/model_sparsity": 0.0012617513178299858, + "compression_loss": 6.081721305847168, + "distillation_loss": 0.23099678754806519, + "epoch": 1.01, + "learning_rate": 4.276053616712958e-05, + "loss": 6.3855, + "step": 1199, + "task_loss": 0.977968692779541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.059992744753518956, + "compression/movement_sparsity/importance_threshold": -0.0043813827845643385, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013320368424346212, + "compression/movement_sparsity/model_sparsity": 0.0012862772790723576, + "compression_loss": 6.481628894805908, + "distillation_loss": 0.35345980525016785, + "epoch": 1.01, + "learning_rate": 4.2754498249003746e-05, + "loss": 6.9538, + "step": 1200, + "task_loss": 0.5519553422927856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06368822777793498, + "compression/movement_sparsity/importance_threshold": -0.004370138865032103, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013814505931176375, + "compression/movement_sparsity/model_sparsity": 0.0013339935154049348, + "compression_loss": 6.880846977233887, + "distillation_loss": 0.18901535868644714, + "epoch": 1.02, + "learning_rate": 4.274846033087791e-05, + "loss": 7.1905, + "step": 1201, + "task_loss": 0.2078811377286911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06737738292200746, + "compression/movement_sparsity/importance_threshold": -0.00435891419878203, + "compression/movement_sparsity/linear_layer_sparsity": 0.001485620121584537, + "compression/movement_sparsity/model_sparsity": 0.0014345845001060432, + "compression_loss": 7.279369354248047, + "distillation_loss": 0.18496623635292053, + "epoch": 1.02, + "learning_rate": 4.274242241275209e-05, + "loss": 7.6314, + "step": 1202, + "task_loss": 0.4454924762248993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07106021560808395, + "compression/movement_sparsity/importance_threshold": -0.00434770876931602, + "compression/movement_sparsity/linear_layer_sparsity": 0.001539231179275377, + "compression/movement_sparsity/model_sparsity": 0.0014863538530382986, + "compression_loss": 7.677201747894287, + "distillation_loss": 0.40692412853240967, + "epoch": 1.02, + "learning_rate": 4.2736384494626254e-05, + "loss": 8.0471, + "step": 1203, + "task_loss": 0.9005877375602722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07473673125851371, + "compression/movement_sparsity/importance_threshold": -0.004336522560135972, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015924725877695053, + "compression/movement_sparsity/model_sparsity": 0.0015377662553609233, + "compression_loss": 8.074338912963867, + "distillation_loss": 0.3484841585159302, + "epoch": 1.02, + "learning_rate": 4.273034657650042e-05, + "loss": 8.5193, + "step": 1204, + "task_loss": 1.199270248413086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07840693529564446, + "compression/movement_sparsity/importance_threshold": -0.004325355554743787, + "compression/movement_sparsity/linear_layer_sparsity": 0.0016590690640158003, + "compression/movement_sparsity/model_sparsity": 0.0016020749377734335, + "compression_loss": 8.470785140991211, + "distillation_loss": 0.2438778281211853, + "epoch": 1.02, + "learning_rate": 4.2724308658374595e-05, + "loss": 8.7582, + "step": 1205, + "task_loss": 0.17912335693836212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08207083314182378, + "compression/movement_sparsity/importance_threshold": -0.004314207736641367, + "compression/movement_sparsity/linear_layer_sparsity": 0.001721670944104081, + "compression/movement_sparsity/model_sparsity": 0.0016625262506947728, + "compression_loss": 8.866531372070312, + "distillation_loss": 0.29903608560562134, + "epoch": 1.02, + "learning_rate": 4.271827074024877e-05, + "loss": 9.2483, + "step": 1206, + "task_loss": 0.48423251509666443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08572843021940091, + "compression/movement_sparsity/importance_threshold": -0.00430307908933061, + "compression/movement_sparsity/linear_layer_sparsity": 0.0018340562240720898, + "compression/movement_sparsity/model_sparsity": 0.0017710507505583196, + "compression_loss": 9.261591911315918, + "distillation_loss": 0.270751953125, + "epoch": 1.02, + "learning_rate": 4.271223282212293e-05, + "loss": 9.5269, + "step": 1207, + "task_loss": 0.39931365847587585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08937973195072357, + "compression/movement_sparsity/importance_threshold": -0.0042919695963134176, + "compression/movement_sparsity/linear_layer_sparsity": 0.002024079759517203, + "compression/movement_sparsity/model_sparsity": 0.0019545463929801217, + "compression_loss": 9.65595817565918, + "distillation_loss": 0.33985987305641174, + "epoch": 1.02, + "learning_rate": 4.2706194903997104e-05, + "loss": 10.0822, + "step": 1208, + "task_loss": 1.621153473854065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09302474375814002, + "compression/movement_sparsity/importance_threshold": -0.0042808792410916905, + "compression/movement_sparsity/linear_layer_sparsity": 0.0020845591377663003, + "compression/movement_sparsity/model_sparsity": 0.0020129481185300327, + "compression_loss": 10.049636840820312, + "distillation_loss": 0.22262011468410492, + "epoch": 1.02, + "learning_rate": 4.270015698587128e-05, + "loss": 10.3218, + "step": 1209, + "task_loss": 0.8181881308555603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09666347106399781, + "compression/movement_sparsity/importance_threshold": -0.004269808007167331, + "compression/movement_sparsity/linear_layer_sparsity": 0.0021946311292129514, + "compression/movement_sparsity/model_sparsity": 0.002119238798449439, + "compression_loss": 10.442619323730469, + "distillation_loss": 0.41165685653686523, + "epoch": 1.02, + "learning_rate": 4.2694119067745445e-05, + "loss": 10.8912, + "step": 1210, + "task_loss": 0.9035671353340149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10029591929064618, + "compression/movement_sparsity/importance_threshold": -0.004258755878042235, + "compression/movement_sparsity/linear_layer_sparsity": 0.002374352183820679, + "compression/movement_sparsity/model_sparsity": 0.0022927858819447576, + "compression_loss": 10.8349027633667, + "distillation_loss": 0.32800403237342834, + "epoch": 1.02, + "learning_rate": 4.268808114961961e-05, + "loss": 11.2702, + "step": 1211, + "task_loss": 0.9444795250892639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10392209386043288, + "compression/movement_sparsity/importance_threshold": -0.004247722837218306, + "compression/movement_sparsity/linear_layer_sparsity": 0.002492079490889554, + "compression/movement_sparsity/model_sparsity": 0.0024064688938442587, + "compression_loss": 11.226493835449219, + "distillation_loss": 0.4187358617782593, + "epoch": 1.02, + "learning_rate": 4.2682043231493786e-05, + "loss": 11.5779, + "step": 1212, + "task_loss": 0.7167431712150574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10754200019570564, + "compression/movement_sparsity/importance_threshold": -0.004236708868197445, + "compression/movement_sparsity/linear_layer_sparsity": 0.002679622799466408, + "compression/movement_sparsity/model_sparsity": 0.0025875695128207967, + "compression_loss": 11.617390632629395, + "distillation_loss": 0.19560036063194275, + "epoch": 1.03, + "learning_rate": 4.267600531336795e-05, + "loss": 11.8148, + "step": 1213, + "task_loss": 0.23003944754600525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11115564371881353, + "compression/movement_sparsity/importance_threshold": -0.00422571395448155, + "compression/movement_sparsity/linear_layer_sparsity": 0.002910283898214542, + "compression/movement_sparsity/model_sparsity": 0.0028103066932303935, + "compression_loss": 12.007583618164062, + "distillation_loss": 0.31421175599098206, + "epoch": 1.03, + "learning_rate": 4.266996739524212e-05, + "loss": 12.3871, + "step": 1214, + "task_loss": 0.4106763005256653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11476302985210429, + "compression/movement_sparsity/importance_threshold": -0.004214738079572524, + "compression/movement_sparsity/linear_layer_sparsity": 0.003149733108510307, + "compression/movement_sparsity/model_sparsity": 0.0030415300865205674, + "compression_loss": 12.397085189819336, + "distillation_loss": 0.18430189788341522, + "epoch": 1.03, + "learning_rate": 4.2663929477116294e-05, + "loss": 12.6978, + "step": 1215, + "task_loss": 0.9206427335739136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11836416401792565, + "compression/movement_sparsity/importance_threshold": -0.004203781226972267, + "compression/movement_sparsity/linear_layer_sparsity": 0.0034414697938893314, + "compression/movement_sparsity/model_sparsity": 0.0033232447192698025, + "compression_loss": 12.785908699035645, + "distillation_loss": 0.21479223668575287, + "epoch": 1.03, + "learning_rate": 4.265789155899047e-05, + "loss": 13.1885, + "step": 1216, + "task_loss": 0.3621373474597931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12195905163862686, + "compression/movement_sparsity/importance_threshold": -0.004192843380182677, + "compression/movement_sparsity/linear_layer_sparsity": 0.0037257657986635774, + "compression/movement_sparsity/model_sparsity": 0.003597774281683244, + "compression_loss": 13.174031257629395, + "distillation_loss": 0.30214691162109375, + "epoch": 1.03, + "learning_rate": 4.265185364086463e-05, + "loss": 13.4975, + "step": 1217, + "task_loss": 0.43067657947540283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1255476981365553, + "compression/movement_sparsity/importance_threshold": -0.0041819245227056575, + "compression/movement_sparsity/linear_layer_sparsity": 0.004055433261292282, + "compression/movement_sparsity/model_sparsity": 0.0039161166527949135, + "compression_loss": 13.561464309692383, + "distillation_loss": 0.20788107812404633, + "epoch": 1.03, + "learning_rate": 4.26458157227388e-05, + "loss": 13.8864, + "step": 1218, + "task_loss": 0.7008878588676453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12913010893405957, + "compression/movement_sparsity/importance_threshold": -0.004171024638043107, + "compression/movement_sparsity/linear_layer_sparsity": 0.004409354480892332, + "compression/movement_sparsity/model_sparsity": 0.004257879589712679, + "compression_loss": 13.948198318481445, + "distillation_loss": 0.25103408098220825, + "epoch": 1.03, + "learning_rate": 4.2639777804612977e-05, + "loss": 14.2062, + "step": 1219, + "task_loss": 0.31897789239883423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1327062894534874, + "compression/movement_sparsity/importance_threshold": -0.004160143709696928, + "compression/movement_sparsity/linear_layer_sparsity": 0.004785025382260197, + "compression/movement_sparsity/model_sparsity": 0.004620645039919687, + "compression_loss": 14.334238052368164, + "distillation_loss": 0.14130473136901855, + "epoch": 1.03, + "learning_rate": 4.263373988648714e-05, + "loss": 14.6271, + "step": 1220, + "task_loss": 0.49916383624076843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1362762451171875, + "compression/movement_sparsity/importance_threshold": -0.004149281721169018, + "compression/movement_sparsity/linear_layer_sparsity": 0.005210288896825615, + "compression/movement_sparsity/model_sparsity": 0.005031299444496189, + "compression_loss": 14.719594955444336, + "distillation_loss": 0.38804569840431213, + "epoch": 1.03, + "learning_rate": 4.262770196836131e-05, + "loss": 15.0129, + "step": 1221, + "task_loss": 1.1559282541275024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13983998134750797, + "compression/movement_sparsity/importance_threshold": -0.00413843865596128, + "compression/movement_sparsity/linear_layer_sparsity": 0.0055203172553580536, + "compression/movement_sparsity/model_sparsity": 0.00533067737515425, + "compression_loss": 15.104253768920898, + "distillation_loss": 0.22257769107818604, + "epoch": 1.03, + "learning_rate": 4.2621664050235485e-05, + "loss": 15.4226, + "step": 1222, + "task_loss": 0.5349777936935425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14339750356679637, + "compression/movement_sparsity/importance_threshold": -0.004127614497575614, + "compression/movement_sparsity/linear_layer_sparsity": 0.005974425331434624, + "compression/movement_sparsity/model_sparsity": 0.005769185441817747, + "compression_loss": 15.488214492797852, + "distillation_loss": 0.27086007595062256, + "epoch": 1.03, + "learning_rate": 4.2615626132109645e-05, + "loss": 15.8748, + "step": 1223, + "task_loss": 0.06427006423473358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14694881719740194, + "compression/movement_sparsity/importance_threshold": -0.004116809229513919, + "compression/movement_sparsity/linear_layer_sparsity": 0.006461134081827645, + "compression/movement_sparsity/model_sparsity": 0.006239174249343519, + "compression_loss": 15.871499061584473, + "distillation_loss": 0.38948312401771545, + "epoch": 1.03, + "learning_rate": 4.260958821398382e-05, + "loss": 16.2235, + "step": 1224, + "task_loss": 0.883831799030304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15049392766167224, + "compression/movement_sparsity/importance_threshold": -0.004106022835278097, + "compression/movement_sparsity/linear_layer_sparsity": 0.007034257275077765, + "compression/movement_sparsity/model_sparsity": 0.006792608897772327, + "compression_loss": 16.254091262817383, + "distillation_loss": 0.3500271439552307, + "epoch": 1.04, + "learning_rate": 4.260355029585799e-05, + "loss": 16.6056, + "step": 1225, + "task_loss": 0.7578271627426147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1540328403819557, + "compression/movement_sparsity/importance_threshold": -0.004095255298370047, + "compression/movement_sparsity/linear_layer_sparsity": 0.007625135553937685, + "compression/movement_sparsity/model_sparsity": 0.007363188689999208, + "compression_loss": 16.636003494262695, + "distillation_loss": 0.2130354642868042, + "epoch": 1.04, + "learning_rate": 4.259751237773216e-05, + "loss": 16.9742, + "step": 1226, + "task_loss": 0.3529678285121918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15756556078059986, + "compression/movement_sparsity/importance_threshold": -0.004084506602291672, + "compression/movement_sparsity/linear_layer_sparsity": 0.008295381092581908, + "compression/movement_sparsity/model_sparsity": 0.008010409232474551, + "compression_loss": 17.017234802246094, + "distillation_loss": 0.19248083233833313, + "epoch": 1.04, + "learning_rate": 4.259147445960633e-05, + "loss": 17.3515, + "step": 1227, + "task_loss": 0.5984662771224976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1610920942799538, + "compression/movement_sparsity/importance_threshold": -0.0040737767305448706, + "compression/movement_sparsity/linear_layer_sparsity": 0.009028383525493679, + "compression/movement_sparsity/model_sparsity": 0.008718230776836563, + "compression_loss": 17.397768020629883, + "distillation_loss": 0.37513667345046997, + "epoch": 1.04, + "learning_rate": 4.25854365414805e-05, + "loss": 17.7558, + "step": 1228, + "task_loss": 1.1217031478881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16461244630236543, + "compression/movement_sparsity/importance_threshold": -0.004063065666631543, + "compression/movement_sparsity/linear_layer_sparsity": 0.009724933777942618, + "compression/movement_sparsity/model_sparsity": 0.009390852385274663, + "compression_loss": 17.777629852294922, + "distillation_loss": 0.2674877643585205, + "epoch": 1.04, + "learning_rate": 4.257939862335467e-05, + "loss": 18.0458, + "step": 1229, + "task_loss": 0.45754510164260864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1681266222701825, + "compression/movement_sparsity/importance_threshold": -0.004052373394053592, + "compression/movement_sparsity/linear_layer_sparsity": 0.010412719767179196, + "compression/movement_sparsity/model_sparsity": 0.010055010809903777, + "compression_loss": 18.15679359436035, + "distillation_loss": 0.31084680557250977, + "epoch": 1.04, + "learning_rate": 4.2573360705228836e-05, + "loss": 18.5965, + "step": 1230, + "task_loss": 0.2089879810810089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17163462760575404, + "compression/movement_sparsity/importance_threshold": -0.004041699896312914, + "compression/movement_sparsity/linear_layer_sparsity": 0.011026683234582146, + "compression/movement_sparsity/model_sparsity": 0.010647882743428887, + "compression_loss": 18.53529167175293, + "distillation_loss": 0.48372378945350647, + "epoch": 1.04, + "learning_rate": 4.256732278710301e-05, + "loss": 18.9975, + "step": 1231, + "task_loss": 0.28310129046440125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17513646773142766, + "compression/movement_sparsity/importance_threshold": -0.004031045156911412, + "compression/movement_sparsity/linear_layer_sparsity": 0.01184038843405344, + "compression/movement_sparsity/model_sparsity": 0.011433634666048352, + "compression_loss": 18.91310691833496, + "distillation_loss": 0.625645101070404, + "epoch": 1.04, + "learning_rate": 4.256128486897718e-05, + "loss": 19.3991, + "step": 1232, + "task_loss": 1.3038017749786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17863214806955124, + "compression/movement_sparsity/importance_threshold": -0.004020409159350988, + "compression/movement_sparsity/linear_layer_sparsity": 0.012480310814399664, + "compression/movement_sparsity/model_sparsity": 0.012051573743998178, + "compression_loss": 19.290283203125, + "distillation_loss": 0.398506224155426, + "epoch": 1.04, + "learning_rate": 4.2555246950851344e-05, + "loss": 19.8206, + "step": 1233, + "task_loss": 0.9363745450973511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18212167404247404, + "compression/movement_sparsity/importance_threshold": -0.004009791887133538, + "compression/movement_sparsity/linear_layer_sparsity": 0.013246140480812964, + "compression/movement_sparsity/model_sparsity": 0.01279109480540256, + "compression_loss": 19.66680145263672, + "distillation_loss": 0.7033413648605347, + "epoch": 1.04, + "learning_rate": 4.254920903272552e-05, + "loss": 20.199, + "step": 1234, + "task_loss": 0.18566125631332397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18560505107254344, + "compression/movement_sparsity/importance_threshold": -0.003999193323760967, + "compression/movement_sparsity/linear_layer_sparsity": 0.013944837083436358, + "compression/movement_sparsity/model_sparsity": 0.013465789030283679, + "compression_loss": 20.042619705200195, + "distillation_loss": 0.6369860172271729, + "epoch": 1.04, + "learning_rate": 4.254317111459969e-05, + "loss": 20.5969, + "step": 1235, + "task_loss": 0.8035876750946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18908228458210785, + "compression/movement_sparsity/importance_threshold": -0.0039886134527351725, + "compression/movement_sparsity/linear_layer_sparsity": 0.014540878526882607, + "compression/movement_sparsity/model_sparsity": 0.014041354616509594, + "compression_loss": 20.417795181274414, + "distillation_loss": 0.680345892906189, + "epoch": 1.04, + "learning_rate": 4.253713319647385e-05, + "loss": 20.9379, + "step": 1236, + "task_loss": 0.9641849398612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19255337999351502, + "compression/movement_sparsity/importance_threshold": -0.0039780522575580575, + "compression/movement_sparsity/linear_layer_sparsity": 0.015364123060462588, + "compression/movement_sparsity/model_sparsity": 0.01483631816776469, + "compression_loss": 20.79230499267578, + "distillation_loss": 0.6249008774757385, + "epoch": 1.05, + "learning_rate": 4.2531095278348026e-05, + "loss": 21.3149, + "step": 1237, + "task_loss": 0.7482644319534302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.196018342729114, + "compression/movement_sparsity/importance_threshold": -0.003967509721731519, + "compression/movement_sparsity/linear_layer_sparsity": 0.016132814527108498, + "compression/movement_sparsity/model_sparsity": 0.015578602717759764, + "compression_loss": 21.166133880615234, + "distillation_loss": 0.3550962805747986, + "epoch": 1.05, + "learning_rate": 4.25250573602222e-05, + "loss": 21.7123, + "step": 1238, + "task_loss": 0.7569570541381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19947717821125255, + "compression/movement_sparsity/importance_threshold": -0.003956985828757461, + "compression/movement_sparsity/linear_layer_sparsity": 0.017022476674420237, + "compression/movement_sparsity/model_sparsity": 0.01643770223339045, + "compression_loss": 21.539316177368164, + "distillation_loss": 0.19553369283676147, + "epoch": 1.05, + "learning_rate": 4.251901944209637e-05, + "loss": 22.049, + "step": 1239, + "task_loss": 0.10882483422756195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20292989186227856, + "compression/movement_sparsity/importance_threshold": -0.003946480562137782, + "compression/movement_sparsity/linear_layer_sparsity": 0.01774728720416617, + "compression/movement_sparsity/model_sparsity": 0.017137613291661614, + "compression_loss": 21.91181755065918, + "distillation_loss": 0.40565478801727295, + "epoch": 1.05, + "learning_rate": 4.2512981523970535e-05, + "loss": 22.3305, + "step": 1240, + "task_loss": 0.5878121256828308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20637648910454076, + "compression/movement_sparsity/importance_threshold": -0.003935993905374383, + "compression/movement_sparsity/linear_layer_sparsity": 0.018443837456615108, + "compression/movement_sparsity/model_sparsity": 0.017810234900099715, + "compression_loss": 22.28365707397461, + "distillation_loss": 0.33983078598976135, + "epoch": 1.05, + "learning_rate": 4.250694360584471e-05, + "loss": 22.7013, + "step": 1241, + "task_loss": 0.16688188910484314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20981697536038724, + "compression/movement_sparsity/importance_threshold": -0.003925525841969164, + "compression/movement_sparsity/linear_layer_sparsity": 0.01903546695803609, + "compression/movement_sparsity/model_sparsity": 0.01838154010808165, + "compression_loss": 22.654813766479492, + "distillation_loss": 0.5087032914161682, + "epoch": 1.05, + "learning_rate": 4.2500905687718876e-05, + "loss": 23.1244, + "step": 1242, + "task_loss": 0.8088805079460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21325135605216555, + "compression/movement_sparsity/importance_threshold": -0.003915076355424027, + "compression/movement_sparsity/linear_layer_sparsity": 0.019716790048414028, + "compression/movement_sparsity/model_sparsity": 0.019039457654310123, + "compression_loss": 23.025293350219727, + "distillation_loss": 0.7074320912361145, + "epoch": 1.05, + "learning_rate": 4.249486776959304e-05, + "loss": 23.543, + "step": 1243, + "task_loss": 1.3079643249511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21667963660222495, + "compression/movement_sparsity/importance_threshold": -0.0039046454292408694, + "compression/movement_sparsity/linear_layer_sparsity": 0.020562928983854867, + "compression/movement_sparsity/model_sparsity": 0.019856529114290738, + "compression_loss": 23.395111083984375, + "distillation_loss": 0.35303163528442383, + "epoch": 1.05, + "learning_rate": 4.248882985146722e-05, + "loss": 23.8201, + "step": 1244, + "task_loss": 0.8302319645881653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22010182243291299, + "compression/movement_sparsity/importance_threshold": -0.003894233046921594, + "compression/movement_sparsity/linear_layer_sparsity": 0.021390001175245962, + "compression/movement_sparsity/model_sparsity": 0.020655188831535883, + "compression_loss": 23.764244079589844, + "distillation_loss": 0.5794785618782043, + "epoch": 1.05, + "learning_rate": 4.2482791933341384e-05, + "loss": 24.3654, + "step": 1245, + "task_loss": 0.6306720972061157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2235179189665779, + "compression/movement_sparsity/importance_threshold": -0.003883839191968101, + "compression/movement_sparsity/linear_layer_sparsity": 0.022431517597400415, + "compression/movement_sparsity/model_sparsity": 0.021660925960510048, + "compression_loss": 24.132734298706055, + "distillation_loss": 0.35249751806259155, + "epoch": 1.05, + "learning_rate": 4.247675401521555e-05, + "loss": 24.6967, + "step": 1246, + "task_loss": 0.5752303004264832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22692793162556762, + "compression/movement_sparsity/importance_threshold": -0.003873463847882292, + "compression/movement_sparsity/linear_layer_sparsity": 0.02340377845392578, + "compression/movement_sparsity/model_sparsity": 0.02259978666558952, + "compression_loss": 24.500560760498047, + "distillation_loss": 0.32665061950683594, + "epoch": 1.05, + "learning_rate": 4.2470716097089725e-05, + "loss": 25.0279, + "step": 1247, + "task_loss": 0.7009742259979248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23033186583223103, + "compression/movement_sparsity/importance_threshold": -0.003863106998166065, + "compression/movement_sparsity/linear_layer_sparsity": 0.024242309770414937, + "compression/movement_sparsity/model_sparsity": 0.02340951185173322, + "compression_loss": 24.867727279663086, + "distillation_loss": 0.6337481737136841, + "epoch": 1.05, + "learning_rate": 4.246467817896389e-05, + "loss": 25.442, + "step": 1248, + "task_loss": 0.6848210692405701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23372972700891603, + "compression/movement_sparsity/importance_threshold": -0.0038527686263213207, + "compression/movement_sparsity/linear_layer_sparsity": 0.02507800313500676, + "compression/movement_sparsity/model_sparsity": 0.024216496578357814, + "compression_loss": 25.234224319458008, + "distillation_loss": 0.8047425150871277, + "epoch": 1.06, + "learning_rate": 4.2458640260838067e-05, + "loss": 25.6424, + "step": 1249, + "task_loss": 0.4596101641654968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23712152057797037, + "compression/movement_sparsity/importance_threshold": -0.0038424487158499623, + "compression/movement_sparsity/linear_layer_sparsity": 0.02585722364167514, + "compression/movement_sparsity/model_sparsity": 0.024968948463459466, + "compression_loss": 25.60004997253418, + "distillation_loss": 0.5374100208282471, + "epoch": 1.06, + "learning_rate": 4.2452602342712234e-05, + "loss": 26.2348, + "step": 1250, + "task_loss": 0.6199985146522522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24050725196174294, + "compression/movement_sparsity/importance_threshold": -0.0038321472502538868, + "compression/movement_sparsity/linear_layer_sparsity": 0.026712901911224666, + "compression/movement_sparsity/model_sparsity": 0.025795231552075713, + "compression_loss": 25.96520233154297, + "distillation_loss": 0.7549915313720703, + "epoch": 1.06, + "learning_rate": 4.244656442458641e-05, + "loss": 26.4838, + "step": 1251, + "task_loss": 0.9914858341217041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24388692658258165, + "compression/movement_sparsity/importance_threshold": -0.0038218642130349967, + "compression/movement_sparsity/linear_layer_sparsity": 0.027609229668244854, + "compression/movement_sparsity/model_sparsity": 0.026660767693215553, + "compression_loss": 26.329713821411133, + "distillation_loss": 0.3892171084880829, + "epoch": 1.06, + "learning_rate": 4.2440526506460575e-05, + "loss": 26.8058, + "step": 1252, + "task_loss": 1.5135600566864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24726054986283408, + "compression/movement_sparsity/importance_threshold": -0.0038115995876951934, + "compression/movement_sparsity/linear_layer_sparsity": 0.02852368216007155, + "compression/movement_sparsity/model_sparsity": 0.027543805928763092, + "compression_loss": 26.693553924560547, + "distillation_loss": 0.5658009052276611, + "epoch": 1.06, + "learning_rate": 4.243448858833474e-05, + "loss": 27.2668, + "step": 1253, + "task_loss": 1.0536291599273682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2506281272248496, + "compression/movement_sparsity/importance_threshold": -0.0038013533577363743, + "compression/movement_sparsity/linear_layer_sparsity": 0.029259713331562832, + "compression/movement_sparsity/model_sparsity": 0.028254552165216915, + "compression_loss": 27.056703567504883, + "distillation_loss": 0.48408210277557373, + "epoch": 1.06, + "learning_rate": 4.2428450670208916e-05, + "loss": 27.5278, + "step": 1254, + "task_loss": 0.7247920036315918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25398966409097584, + "compression/movement_sparsity/importance_threshold": -0.0037911255066604413, + "compression/movement_sparsity/linear_layer_sparsity": 0.03041721613231133, + "compression/movement_sparsity/model_sparsity": 0.029372291183864578, + "compression_loss": 27.419187545776367, + "distillation_loss": 0.2873002588748932, + "epoch": 1.06, + "learning_rate": 4.242241275208308e-05, + "loss": 27.8572, + "step": 1255, + "task_loss": 0.9619093537330627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2573451658835608, + "compression/movement_sparsity/importance_threshold": -0.003780916017969296, + "compression/movement_sparsity/linear_layer_sparsity": 0.031211651876883066, + "compression/movement_sparsity/model_sparsity": 0.030139435616640065, + "compression_loss": 27.781023025512695, + "distillation_loss": 0.6032617688179016, + "epoch": 1.06, + "learning_rate": 4.241637483395725e-05, + "loss": 28.2365, + "step": 1256, + "task_loss": 0.5064491629600525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26069463802495263, + "compression/movement_sparsity/importance_threshold": -0.003770724875164838, + "compression/movement_sparsity/linear_layer_sparsity": 0.032219232117945855, + "compression/movement_sparsity/model_sparsity": 0.031112402376742967, + "compression_loss": 28.142189025878906, + "distillation_loss": 0.44703471660614014, + "epoch": 1.06, + "learning_rate": 4.2410336915831424e-05, + "loss": 28.5133, + "step": 1257, + "task_loss": 0.6577385067939758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2640380859375, + "compression/movement_sparsity/importance_threshold": -0.0037605520617489674, + "compression/movement_sparsity/linear_layer_sparsity": 0.03317983113852334, + "compression/movement_sparsity/model_sparsity": 0.03204000186581538, + "compression_loss": 28.50269317626953, + "distillation_loss": 0.5529592633247375, + "epoch": 1.06, + "learning_rate": 4.240429899770559e-05, + "loss": 28.9456, + "step": 1258, + "task_loss": 0.6998945474624634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26737551504355084, + "compression/movement_sparsity/importance_threshold": -0.003750397561223585, + "compression/movement_sparsity/linear_layer_sparsity": 0.03410360832944129, + "compression/movement_sparsity/model_sparsity": 0.03293204446835425, + "compression_loss": 28.862529754638672, + "distillation_loss": 0.20232579112052917, + "epoch": 1.06, + "learning_rate": 4.2398261079579765e-05, + "loss": 29.1511, + "step": 1259, + "task_loss": 0.04054655879735947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27070693076545305, + "compression/movement_sparsity/importance_threshold": -0.003740261357090592, + "compression/movement_sparsity/linear_layer_sparsity": 0.035138053720187676, + "compression/movement_sparsity/model_sparsity": 0.033930953477602256, + "compression_loss": 29.22166633605957, + "distillation_loss": 0.5860152244567871, + "epoch": 1.07, + "learning_rate": 4.239222316145393e-05, + "loss": 29.6411, + "step": 1260, + "task_loss": 0.6404820084571838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27403233852555536, + "compression/movement_sparsity/importance_threshold": -0.003730143432851887, + "compression/movement_sparsity/linear_layer_sparsity": 0.03619793336050136, + "compression/movement_sparsity/model_sparsity": 0.03495442299170001, + "compression_loss": 29.58013153076172, + "distillation_loss": 0.44030773639678955, + "epoch": 1.07, + "learning_rate": 4.23861852433281e-05, + "loss": 30.0218, + "step": 1261, + "task_loss": 0.8194950819015503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27735174374620586, + "compression/movement_sparsity/importance_threshold": -0.0037200437720093715, + "compression/movement_sparsity/linear_layer_sparsity": 0.037563035919790225, + "compression/movement_sparsity/model_sparsity": 0.036272630078530634, + "compression_loss": 29.937925338745117, + "distillation_loss": 0.3283558189868927, + "epoch": 1.07, + "learning_rate": 4.2380147325202274e-05, + "loss": 30.3759, + "step": 1262, + "task_loss": 0.4912737309932709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2806651518497526, + "compression/movement_sparsity/importance_threshold": -0.0037099623580649464, + "compression/movement_sparsity/linear_layer_sparsity": 0.03880109839708661, + "compression/movement_sparsity/model_sparsity": 0.03746816130100621, + "compression_loss": 30.29505729675293, + "distillation_loss": 0.34929248690605164, + "epoch": 1.07, + "learning_rate": 4.237410940707644e-05, + "loss": 30.7296, + "step": 1263, + "task_loss": 0.3152148127555847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28397256825854333, + "compression/movement_sparsity/importance_threshold": -0.0036998991745205126, + "compression/movement_sparsity/linear_layer_sparsity": 0.03984663326373435, + "compression/movement_sparsity/model_sparsity": 0.03847777882854314, + "compression_loss": 30.651500701904297, + "distillation_loss": 0.46013110876083374, + "epoch": 1.07, + "learning_rate": 4.236807148895061e-05, + "loss": 31.1531, + "step": 1264, + "task_loss": 0.5476149916648865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2872739983949271, + "compression/movement_sparsity/importance_threshold": -0.003689854204877969, + "compression/movement_sparsity/linear_layer_sparsity": 0.041110833516488546, + "compression/movement_sparsity/model_sparsity": 0.039698549913480354, + "compression_loss": 31.00731086730957, + "distillation_loss": 0.42440980672836304, + "epoch": 1.07, + "learning_rate": 4.236203357082478e-05, + "loss": 31.4518, + "step": 1265, + "task_loss": 0.7424221038818359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2905694476812517, + "compression/movement_sparsity/importance_threshold": -0.0036798274326392162, + "compression/movement_sparsity/linear_layer_sparsity": 0.04239200185978858, + "compression/movement_sparsity/model_sparsity": 0.0409357061828532, + "compression_loss": 31.36248207092285, + "distillation_loss": 0.571715235710144, + "epoch": 1.07, + "learning_rate": 4.235599565269895e-05, + "loss": 32.0213, + "step": 1266, + "task_loss": 1.1389743089675903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.293858921539865, + "compression/movement_sparsity/importance_threshold": -0.003669818841306157, + "compression/movement_sparsity/linear_layer_sparsity": 0.04383340716777934, + "compression/movement_sparsity/model_sparsity": 0.042327594784233084, + "compression_loss": 31.717010498046875, + "distillation_loss": 0.3843132555484772, + "epoch": 1.07, + "learning_rate": 4.234995773457312e-05, + "loss": 32.2031, + "step": 1267, + "task_loss": 1.3387547731399536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29714242539311586, + "compression/movement_sparsity/importance_threshold": -0.003659828414380689, + "compression/movement_sparsity/linear_layer_sparsity": 0.04498444706966919, + "compression/movement_sparsity/model_sparsity": 0.04343909292448011, + "compression_loss": 32.070865631103516, + "distillation_loss": 0.3448026478290558, + "epoch": 1.07, + "learning_rate": 4.234391981644729e-05, + "loss": 32.6186, + "step": 1268, + "task_loss": 0.9032401442527771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30041996466335225, + "compression/movement_sparsity/importance_threshold": -0.0036498561353647133, + "compression/movement_sparsity/linear_layer_sparsity": 0.04615513799982295, + "compression/movement_sparsity/model_sparsity": 0.04456956701971654, + "compression_loss": 32.424102783203125, + "distillation_loss": 0.5070450901985168, + "epoch": 1.07, + "learning_rate": 4.2337881898321464e-05, + "loss": 32.7972, + "step": 1269, + "task_loss": 0.8359112739562988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3036915447729217, + "compression/movement_sparsity/importance_threshold": -0.0036399019877601324, + "compression/movement_sparsity/linear_layer_sparsity": 0.04731974760448242, + "compression/movement_sparsity/model_sparsity": 0.04569416870169775, + "compression_loss": 32.77665328979492, + "distillation_loss": 0.44396933913230896, + "epoch": 1.07, + "learning_rate": 4.233184398019563e-05, + "loss": 33.1326, + "step": 1270, + "task_loss": 0.5468348264694214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3069571711441735, + "compression/movement_sparsity/importance_threshold": -0.0036299659550688436, + "compression/movement_sparsity/linear_layer_sparsity": 0.048803698342597936, + "compression/movement_sparsity/model_sparsity": 0.047127141166792556, + "compression_loss": 33.12855529785156, + "distillation_loss": 0.3639471232891083, + "epoch": 1.07, + "learning_rate": 4.23258060620698e-05, + "loss": 33.5157, + "step": 1271, + "task_loss": 0.41027218103408813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31021684919945514, + "compression/movement_sparsity/importance_threshold": -0.0036200480207927497, + "compression/movement_sparsity/linear_layer_sparsity": 0.050247977374988935, + "compression/movement_sparsity/model_sparsity": 0.04852180477129892, + "compression_loss": 33.479766845703125, + "distillation_loss": 0.4434579312801361, + "epoch": 1.08, + "learning_rate": 4.231976814394397e-05, + "loss": 33.8952, + "step": 1272, + "task_loss": 0.8820655941963196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31347058436111475, + "compression/movement_sparsity/importance_threshold": -0.003610148168433751, + "compression/movement_sparsity/linear_layer_sparsity": 0.05193835930321651, + "compression/movement_sparsity/model_sparsity": 0.05015411688006882, + "compression_loss": 33.83030319213867, + "distillation_loss": 0.33706918358802795, + "epoch": 1.08, + "learning_rate": 4.231373022581814e-05, + "loss": 34.2512, + "step": 1273, + "task_loss": 0.6071690320968628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31671838205150105, + "compression/movement_sparsity/importance_threshold": -0.0036002663814937463, + "compression/movement_sparsity/linear_layer_sparsity": 0.053383806904035824, + "compression/movement_sparsity/model_sparsity": 0.05154990890908306, + "compression_loss": 34.18016815185547, + "distillation_loss": 0.5281939506530762, + "epoch": 1.08, + "learning_rate": 4.230769230769231e-05, + "loss": 34.661, + "step": 1274, + "task_loss": 1.3885276317596436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31996024769296194, + "compression/movement_sparsity/importance_threshold": -0.0035904026434746376, + "compression/movement_sparsity/linear_layer_sparsity": 0.05498815596277065, + "compression/movement_sparsity/model_sparsity": 0.053099143642095344, + "compression_loss": 34.529396057128906, + "distillation_loss": 0.41586706042289734, + "epoch": 1.08, + "learning_rate": 4.230165438956648e-05, + "loss": 35.0392, + "step": 1275, + "task_loss": 0.5115939974784851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32319618670784567, + "compression/movement_sparsity/importance_threshold": -0.0035805569378783244, + "compression/movement_sparsity/linear_layer_sparsity": 0.05621701298265215, + "compression/movement_sparsity/model_sparsity": 0.05428578564293754, + "compression_loss": 34.87797546386719, + "distillation_loss": 0.6999092698097229, + "epoch": 1.08, + "learning_rate": 4.229561647144065e-05, + "loss": 35.4152, + "step": 1276, + "task_loss": 0.4918098449707031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32642620451849996, + "compression/movement_sparsity/importance_threshold": -0.0035707292482067094, + "compression/movement_sparsity/linear_layer_sparsity": 0.05768861028309691, + "compression/movement_sparsity/model_sparsity": 0.0557068290489492, + "compression_loss": 35.22592544555664, + "distillation_loss": 0.3220058083534241, + "epoch": 1.08, + "learning_rate": 4.2289578553314815e-05, + "loss": 35.6123, + "step": 1277, + "task_loss": 0.20578202605247498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3296503065472739, + "compression/movement_sparsity/importance_threshold": -0.0035609195579616895, + "compression/movement_sparsity/linear_layer_sparsity": 0.05917731876409914, + "compression/movement_sparsity/model_sparsity": 0.05714439581382603, + "compression_loss": 35.57320022583008, + "distillation_loss": 0.5208441615104675, + "epoch": 1.08, + "learning_rate": 4.228354063518899e-05, + "loss": 36.0763, + "step": 1278, + "task_loss": 0.7269611954689026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33286849821651504, + "compression/movement_sparsity/importance_threshold": -0.0035511278506451674, + "compression/movement_sparsity/linear_layer_sparsity": 0.060484100219481, + "compression/movement_sparsity/model_sparsity": 0.05840628530608555, + "compression_loss": 35.9197998046875, + "distillation_loss": 0.6241443753242493, + "epoch": 1.08, + "learning_rate": 4.227750271706316e-05, + "loss": 36.9221, + "step": 1279, + "task_loss": 0.06261111795902252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33608078494857196, + "compression/movement_sparsity/importance_threshold": -0.003541354109759043, + "compression/movement_sparsity/linear_layer_sparsity": 0.06205791148490038, + "compression/movement_sparsity/model_sparsity": 0.059926031312928014, + "compression_loss": 36.26581954956055, + "distillation_loss": 1.0040439367294312, + "epoch": 1.08, + "learning_rate": 4.2271464798937324e-05, + "loss": 37.0246, + "step": 1280, + "task_loss": 1.213135838508606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3392871721657921, + "compression/movement_sparsity/importance_threshold": -0.0035315983188052187, + "compression/movement_sparsity/linear_layer_sparsity": 0.06349304468071468, + "compression/movement_sparsity/model_sparsity": 0.061311863268479976, + "compression_loss": 36.61121368408203, + "distillation_loss": 0.634764552116394, + "epoch": 1.08, + "learning_rate": 4.22654268808115e-05, + "loss": 37.2573, + "step": 1281, + "task_loss": 1.138067603111267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3424876652905246, + "compression/movement_sparsity/importance_threshold": -0.0035218604612855912, + "compression/movement_sparsity/linear_layer_sparsity": 0.06481805818841177, + "compression/movement_sparsity/model_sparsity": 0.06259135848596935, + "compression_loss": 36.95594024658203, + "distillation_loss": 0.6046814918518066, + "epoch": 1.08, + "learning_rate": 4.225938896268567e-05, + "loss": 37.4981, + "step": 1282, + "task_loss": 1.0641181468963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3456822697451173, + "compression/movement_sparsity/importance_threshold": -0.0035121405207020635, + "compression/movement_sparsity/linear_layer_sparsity": 0.06629898018794778, + "compression/movement_sparsity/model_sparsity": 0.06402140625897235, + "compression_loss": 37.30006408691406, + "distillation_loss": 0.7174438834190369, + "epoch": 1.08, + "learning_rate": 4.225335104455984e-05, + "loss": 38.0405, + "step": 1283, + "task_loss": 0.6762393116950989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34887099095191787, + "compression/movement_sparsity/importance_threshold": -0.0035024384805565365, + "compression/movement_sparsity/linear_layer_sparsity": 0.06784382765017964, + "compression/movement_sparsity/model_sparsity": 0.06551318345836987, + "compression_loss": 37.643531799316406, + "distillation_loss": 0.5281542539596558, + "epoch": 1.09, + "learning_rate": 4.2247313126434006e-05, + "loss": 38.19, + "step": 1284, + "task_loss": 0.32907983660697937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3520538343332754, + "compression/movement_sparsity/importance_threshold": -0.003492754324350909, + "compression/movement_sparsity/linear_layer_sparsity": 0.06892196319114384, + "compression/movement_sparsity/model_sparsity": 0.06655428172676907, + "compression_loss": 37.986392974853516, + "distillation_loss": 0.3914589285850525, + "epoch": 1.09, + "learning_rate": 4.224127520830818e-05, + "loss": 38.4515, + "step": 1285, + "task_loss": 0.6987112760543823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35523080531153767, + "compression/movement_sparsity/importance_threshold": -0.0034830880355870815, + "compression/movement_sparsity/linear_layer_sparsity": 0.0699324290807745, + "compression/movement_sparsity/model_sparsity": 0.06753003500453425, + "compression_loss": 38.32862854003906, + "distillation_loss": 0.5760637521743774, + "epoch": 1.09, + "learning_rate": 4.223523729018235e-05, + "loss": 38.9482, + "step": 1286, + "task_loss": 0.5916048288345337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3584019093090525, + "compression/movement_sparsity/importance_threshold": -0.003473439597766956, + "compression/movement_sparsity/linear_layer_sparsity": 0.07135494650723005, + "compression/movement_sparsity/model_sparsity": 0.06890368458121558, + "compression_loss": 38.67019271850586, + "distillation_loss": 0.5432088971138, + "epoch": 1.09, + "learning_rate": 4.2229199372056514e-05, + "loss": 39.4064, + "step": 1287, + "task_loss": 0.1140303760766983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3615671517481689, + "compression/movement_sparsity/importance_threshold": -0.0034638089943924313, + "compression/movement_sparsity/linear_layer_sparsity": 0.07262639665390686, + "compression/movement_sparsity/model_sparsity": 0.07013145650391588, + "compression_loss": 39.01117706298828, + "distillation_loss": 0.5712504982948303, + "epoch": 1.09, + "learning_rate": 4.222316145393069e-05, + "loss": 39.5572, + "step": 1288, + "task_loss": 1.0062147378921509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3647265380512345, + "compression/movement_sparsity/importance_threshold": -0.003454196208965409, + "compression/movement_sparsity/linear_layer_sparsity": 0.07405844149030345, + "compression/movement_sparsity/model_sparsity": 0.07151430619469705, + "compression_loss": 39.35145950317383, + "distillation_loss": 0.5996578931808472, + "epoch": 1.09, + "learning_rate": 4.221712353580486e-05, + "loss": 40.1684, + "step": 1289, + "task_loss": 0.24615256488323212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36788007364059727, + "compression/movement_sparsity/importance_threshold": -0.00344460122498779, + "compression/movement_sparsity/linear_layer_sparsity": 0.07515756356630676, + "compression/movement_sparsity/model_sparsity": 0.07257567004609465, + "compression_loss": 39.6911506652832, + "distillation_loss": 0.5459069013595581, + "epoch": 1.09, + "learning_rate": 4.221108561767902e-05, + "loss": 40.3805, + "step": 1290, + "task_loss": 0.6144179105758667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37102776393860626, + "compression/movement_sparsity/importance_threshold": -0.0034350240259614726, + "compression/movement_sparsity/linear_layer_sparsity": 0.07670222024185647, + "compression/movement_sparsity/model_sparsity": 0.07406726301291947, + "compression_loss": 40.0301628112793, + "distillation_loss": 0.4156564772129059, + "epoch": 1.09, + "learning_rate": 4.22050476995532e-05, + "loss": 40.6101, + "step": 1291, + "task_loss": 1.1754850149154663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37416961436760904, + "compression/movement_sparsity/importance_threshold": -0.003425464595388359, + "compression/movement_sparsity/linear_layer_sparsity": 0.07797237065426095, + "compression/movement_sparsity/model_sparsity": 0.07529377985121816, + "compression_loss": 40.368595123291016, + "distillation_loss": 0.7206194996833801, + "epoch": 1.09, + "learning_rate": 4.219900978142737e-05, + "loss": 41.0608, + "step": 1292, + "task_loss": 0.6590170860290527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.377305630349954, + "compression/movement_sparsity/importance_threshold": -0.0034159229167703485, + "compression/movement_sparsity/linear_layer_sparsity": 0.07923505653772539, + "compression/movement_sparsity/model_sparsity": 0.07651308859010945, + "compression_loss": 40.70634460449219, + "distillation_loss": 0.387393057346344, + "epoch": 1.09, + "learning_rate": 4.219297186330153e-05, + "loss": 41.1842, + "step": 1293, + "task_loss": 0.3437434136867523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38043581730798876, + "compression/movement_sparsity/importance_threshold": -0.003406398973609344, + "compression/movement_sparsity/linear_layer_sparsity": 0.08078425632114436, + "compression/movement_sparsity/model_sparsity": 0.07800906859507199, + "compression_loss": 41.04345703125, + "distillation_loss": 0.599915623664856, + "epoch": 1.09, + "learning_rate": 4.2186933945175705e-05, + "loss": 41.6076, + "step": 1294, + "task_loss": 0.8313719034194946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3835601806640625, + "compression/movement_sparsity/importance_threshold": -0.003396892749407243, + "compression/movement_sparsity/linear_layer_sparsity": 0.08215526134341297, + "compression/movement_sparsity/model_sparsity": 0.0793329753771209, + "compression_loss": 41.37990951538086, + "distillation_loss": 0.47436046600341797, + "epoch": 1.09, + "learning_rate": 4.218089602704988e-05, + "loss": 41.9317, + "step": 1295, + "task_loss": 0.5215071439743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3866787258405228, + "compression/movement_sparsity/importance_threshold": -0.003387404227665948, + "compression/movement_sparsity/linear_layer_sparsity": 0.08378419803581295, + "compression/movement_sparsity/model_sparsity": 0.08090595308294155, + "compression_loss": 41.715736389160156, + "distillation_loss": 0.6345131993293762, + "epoch": 1.1, + "learning_rate": 4.217485810892404e-05, + "loss": 42.2746, + "step": 1296, + "task_loss": 0.2941673994064331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3897914582597178, + "compression/movement_sparsity/importance_threshold": -0.003377933391887359, + "compression/movement_sparsity/linear_layer_sparsity": 0.0852309215225693, + "compression/movement_sparsity/model_sparsity": 0.0823029771672858, + "compression_loss": 42.05088806152344, + "distillation_loss": 0.7908411026000977, + "epoch": 1.1, + "learning_rate": 4.216882019079821e-05, + "loss": 42.5886, + "step": 1297, + "task_loss": 0.5576940178871155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3928983833439961, + "compression/movement_sparsity/importance_threshold": -0.003368480225573375, + "compression/movement_sparsity/linear_layer_sparsity": 0.08685205980933543, + "compression/movement_sparsity/model_sparsity": 0.0838684243666968, + "compression_loss": 42.3853759765625, + "distillation_loss": 0.7869074940681458, + "epoch": 1.1, + "learning_rate": 4.216278227267239e-05, + "loss": 42.8715, + "step": 1298, + "task_loss": 0.4372589588165283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39599950651570576, + "compression/movement_sparsity/importance_threshold": -0.003359044712225898, + "compression/movement_sparsity/linear_layer_sparsity": 0.08861368863918728, + "compression/movement_sparsity/model_sparsity": 0.08556953582683907, + "compression_loss": 42.71921920776367, + "distillation_loss": 0.6729541420936584, + "epoch": 1.1, + "learning_rate": 4.2156744354546554e-05, + "loss": 43.2173, + "step": 1299, + "task_loss": 0.7337606549263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3990948331971949, + "compression/movement_sparsity/importance_threshold": -0.003349626835346828, + "compression/movement_sparsity/linear_layer_sparsity": 0.09031417033740244, + "compression/movement_sparsity/model_sparsity": 0.08721160074742694, + "compression_loss": 43.05238723754883, + "distillation_loss": 0.481242835521698, + "epoch": 1.1, + "learning_rate": 4.215070643642072e-05, + "loss": 43.4554, + "step": 1300, + "task_loss": 0.44292038679122925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40218436881081115, + "compression/movement_sparsity/importance_threshold": -0.0033402265784380665, + "compression/movement_sparsity/linear_layer_sparsity": 0.0917581751139378, + "compression/movement_sparsity/model_sparsity": 0.08860599951761004, + "compression_loss": 43.384883880615234, + "distillation_loss": 0.48726382851600647, + "epoch": 1.1, + "learning_rate": 4.2144668518294896e-05, + "loss": 43.8635, + "step": 1301, + "task_loss": 0.5337143540382385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40526811877890373, + "compression/movement_sparsity/importance_threshold": -0.003330843925001512, + "compression/movement_sparsity/linear_layer_sparsity": 0.09375038157336435, + "compression/movement_sparsity/model_sparsity": 0.09052976755641136, + "compression_loss": 43.716739654541016, + "distillation_loss": 0.8898061513900757, + "epoch": 1.1, + "learning_rate": 4.213863060016906e-05, + "loss": 44.3364, + "step": 1302, + "task_loss": 1.9387022256851196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40834608852382054, + "compression/movement_sparsity/importance_threshold": -0.0033214788585390655, + "compression/movement_sparsity/linear_layer_sparsity": 0.09538371828362195, + "compression/movement_sparsity/model_sparsity": 0.09210699412594017, + "compression_loss": 44.0479850769043, + "distillation_loss": 0.47995662689208984, + "epoch": 1.1, + "learning_rate": 4.213259268204323e-05, + "loss": 44.602, + "step": 1303, + "task_loss": 0.4069124758243561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.411418283467909, + "compression/movement_sparsity/importance_threshold": -0.00331213136255263, + "compression/movement_sparsity/linear_layer_sparsity": 0.09698178330601268, + "compression/movement_sparsity/model_sparsity": 0.09365016069858872, + "compression_loss": 44.37854766845703, + "distillation_loss": 0.40206286311149597, + "epoch": 1.1, + "learning_rate": 4.2126554763917404e-05, + "loss": 44.9244, + "step": 1304, + "task_loss": 1.0917549133300781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4144847090335183, + "compression/movement_sparsity/importance_threshold": -0.0033028014205441023, + "compression/movement_sparsity/linear_layer_sparsity": 0.09879221775399813, + "compression/movement_sparsity/model_sparsity": 0.09539840115373806, + "compression_loss": 44.708492279052734, + "distillation_loss": 0.38143932819366455, + "epoch": 1.1, + "learning_rate": 4.212051684579157e-05, + "loss": 45.2843, + "step": 1305, + "task_loss": 0.8777618408203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4175453706429964, + "compression/movement_sparsity/importance_threshold": -0.0032934890160153845, + "compression/movement_sparsity/linear_layer_sparsity": 0.10060379692207662, + "compression/movement_sparsity/model_sparsity": 0.09714774700432367, + "compression_loss": 45.03784942626953, + "distillation_loss": 0.23496291041374207, + "epoch": 1.1, + "learning_rate": 4.211447892766574e-05, + "loss": 45.3763, + "step": 1306, + "task_loss": 0.5907947421073914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4206002737186907, + "compression/movement_sparsity/importance_threshold": -0.0032841941324683785, + "compression/movement_sparsity/linear_layer_sparsity": 0.10225726162730356, + "compression/movement_sparsity/model_sparsity": 0.09874441011027367, + "compression_loss": 45.36653137207031, + "distillation_loss": 0.6294994354248047, + "epoch": 1.1, + "learning_rate": 4.210844100953991e-05, + "loss": 45.9615, + "step": 1307, + "task_loss": 1.5922635793685913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42364942368295055, + "compression/movement_sparsity/importance_threshold": -0.0032749167534049817, + "compression/movement_sparsity/linear_layer_sparsity": 0.10414325952559747, + "compression/movement_sparsity/model_sparsity": 0.10056561817875301, + "compression_loss": 45.694557189941406, + "distillation_loss": 0.2052983194589615, + "epoch": 1.11, + "learning_rate": 4.2102403091414086e-05, + "loss": 46.1253, + "step": 1308, + "task_loss": 0.5349974632263184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42669282595812374, + "compression/movement_sparsity/importance_threshold": -0.0032656568623270956, + "compression/movement_sparsity/linear_layer_sparsity": 0.10596450919362864, + "compression/movement_sparsity/model_sparsity": 0.102324302317868, + "compression_loss": 46.02192687988281, + "distillation_loss": 0.433124303817749, + "epoch": 1.11, + "learning_rate": 4.209636517328825e-05, + "loss": 46.5536, + "step": 1309, + "task_loss": 0.5030676126480103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42973048596655783, + "compression/movement_sparsity/importance_threshold": -0.0032564144427366234, + "compression/movement_sparsity/linear_layer_sparsity": 0.10777036476124191, + "compression/movement_sparsity/model_sparsity": 0.10406812119127222, + "compression_loss": 46.34867858886719, + "distillation_loss": 0.4955660402774811, + "epoch": 1.11, + "learning_rate": 4.209032725516242e-05, + "loss": 46.8545, + "step": 1310, + "task_loss": 0.5676338076591492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4327624091306019, + "compression/movement_sparsity/importance_threshold": -0.003247189478135462, + "compression/movement_sparsity/linear_layer_sparsity": 0.10968913027219918, + "compression/movement_sparsity/model_sparsity": 0.10592097120411496, + "compression_loss": 46.674800872802734, + "distillation_loss": 0.5332227945327759, + "epoch": 1.11, + "learning_rate": 4.2084289337036595e-05, + "loss": 47.2699, + "step": 1311, + "task_loss": 0.4865185022354126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4357886008726038, + "compression/movement_sparsity/importance_threshold": -0.003237981952025513, + "compression/movement_sparsity/linear_layer_sparsity": 0.11165584286122024, + "compression/movement_sparsity/model_sparsity": 0.10782012116538756, + "compression_loss": 47.000335693359375, + "distillation_loss": 0.5922881364822388, + "epoch": 1.11, + "learning_rate": 4.207825141891076e-05, + "loss": 47.4329, + "step": 1312, + "task_loss": 0.7178448438644409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4388090666149118, + "compression/movement_sparsity/importance_threshold": -0.003228791847908677, + "compression/movement_sparsity/linear_layer_sparsity": 0.1137394599897433, + "compression/movement_sparsity/model_sparsity": 0.10983215963558982, + "compression_loss": 47.32521438598633, + "distillation_loss": 0.5975414514541626, + "epoch": 1.11, + "learning_rate": 4.207221350078493e-05, + "loss": 47.7347, + "step": 1313, + "task_loss": 1.5485409498214722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4418238117798735, + "compression/movement_sparsity/importance_threshold": -0.0032196191492868554, + "compression/movement_sparsity/linear_layer_sparsity": 0.11582783486115308, + "compression/movement_sparsity/model_sparsity": 0.1118487924055741, + "compression_loss": 47.64946365356445, + "distillation_loss": 0.7130229473114014, + "epoch": 1.11, + "learning_rate": 4.20661755826591e-05, + "loss": 48.2814, + "step": 1314, + "task_loss": 0.5504848957061768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44483284178983795, + "compression/movement_sparsity/importance_threshold": -0.003210463839661948, + "compression/movement_sparsity/linear_layer_sparsity": 0.11770981431495371, + "compression/movement_sparsity/model_sparsity": 0.11366612007549068, + "compression_loss": 47.97305679321289, + "distillation_loss": 0.48751258850097656, + "epoch": 1.11, + "learning_rate": 4.206013766453327e-05, + "loss": 48.421, + "step": 1315, + "task_loss": 1.4994933605194092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44783616206715304, + "compression/movement_sparsity/importance_threshold": -0.003201325902535854, + "compression/movement_sparsity/linear_layer_sparsity": 0.11952941844785113, + "compression/movement_sparsity/model_sparsity": 0.11542321520866601, + "compression_loss": 48.29603576660156, + "distillation_loss": 0.5306200981140137, + "epoch": 1.11, + "learning_rate": 4.205409974640744e-05, + "loss": 48.7778, + "step": 1316, + "task_loss": 0.36964669823646545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.450833778034167, + "compression/movement_sparsity/importance_threshold": -0.0031922053214104747, + "compression/movement_sparsity/linear_layer_sparsity": 0.1215770245901139, + "compression/movement_sparsity/model_sparsity": 0.11740047978076877, + "compression_loss": 48.618343353271484, + "distillation_loss": 0.4229336380958557, + "epoch": 1.11, + "learning_rate": 4.204806182828161e-05, + "loss": 49.0859, + "step": 1317, + "task_loss": 0.42890480160713196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45382569511322746, + "compression/movement_sparsity/importance_threshold": -0.003183102079787712, + "compression/movement_sparsity/linear_layer_sparsity": 0.1235310856372733, + "compression/movement_sparsity/model_sparsity": 0.11928741281956334, + "compression_loss": 48.940059661865234, + "distillation_loss": 0.4254857003688812, + "epoch": 1.11, + "learning_rate": 4.204202391015578e-05, + "loss": 49.4511, + "step": 1318, + "task_loss": 0.2547108232975006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4568119187266834, + "compression/movement_sparsity/importance_threshold": -0.003174016161169465, + "compression/movement_sparsity/linear_layer_sparsity": 0.12543979907491354, + "compression/movement_sparsity/model_sparsity": 0.12113055607873129, + "compression_loss": 49.261146545410156, + "distillation_loss": 0.5012710690498352, + "epoch": 1.11, + "learning_rate": 4.203598599202995e-05, + "loss": 49.7314, + "step": 1319, + "task_loss": 0.288612425327301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4597924542968828, + "compression/movement_sparsity/importance_threshold": -0.003164947549057633, + "compression/movement_sparsity/linear_layer_sparsity": 0.12745320670439664, + "compression/movement_sparsity/model_sparsity": 0.1230747969621753, + "compression_loss": 49.581607818603516, + "distillation_loss": 0.6273993253707886, + "epoch": 1.12, + "learning_rate": 4.202994807390412e-05, + "loss": 50.4047, + "step": 1320, + "task_loss": 0.9354586005210876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46276730724617354, + "compression/movement_sparsity/importance_threshold": -0.0031558962269541196, + "compression/movement_sparsity/linear_layer_sparsity": 0.12927622114923792, + "compression/movement_sparsity/model_sparsity": 0.12483518525258787, + "compression_loss": 49.90154266357422, + "distillation_loss": 0.5354970693588257, + "epoch": 1.12, + "learning_rate": 4.202391015577829e-05, + "loss": 50.5532, + "step": 1321, + "task_loss": 0.5257239937782288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4657364829969042, + "compression/movement_sparsity/importance_threshold": -0.003146862178360822, + "compression/movement_sparsity/linear_layer_sparsity": 0.13102729709073205, + "compression/movement_sparsity/model_sparsity": 0.12652610634855196, + "compression_loss": 50.220909118652344, + "distillation_loss": 0.7406328320503235, + "epoch": 1.12, + "learning_rate": 4.201787223765246e-05, + "loss": 51.2674, + "step": 1322, + "task_loss": 1.5844182968139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46869998697142295, + "compression/movement_sparsity/importance_threshold": -0.0031378453867796424, + "compression/movement_sparsity/linear_layer_sparsity": 0.13292184461722087, + "compression/movement_sparsity/model_sparsity": 0.128355570339196, + "compression_loss": 50.539695739746094, + "distillation_loss": 0.627805233001709, + "epoch": 1.12, + "learning_rate": 4.201183431952663e-05, + "loss": 51.1895, + "step": 1323, + "task_loss": 0.5349506139755249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4716578245920773, + "compression/movement_sparsity/importance_threshold": -0.0031288458357124826, + "compression/movement_sparsity/linear_layer_sparsity": 0.13435700166137043, + "compression/movement_sparsity/model_sparsity": 0.12974142532381955, + "compression_loss": 50.85783386230469, + "distillation_loss": 0.4615139663219452, + "epoch": 1.12, + "learning_rate": 4.20057964014008e-05, + "loss": 51.3966, + "step": 1324, + "task_loss": 0.758313775062561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47461000128121655, + "compression/movement_sparsity/importance_threshold": -0.003119863508661239, + "compression/movement_sparsity/linear_layer_sparsity": 0.13604071797988956, + "compression/movement_sparsity/model_sparsity": 0.13136730080708028, + "compression_loss": 51.1754035949707, + "distillation_loss": 1.027340292930603, + "epoch": 1.12, + "learning_rate": 4.199975848327497e-05, + "loss": 51.8103, + "step": 1325, + "task_loss": 0.8412054181098938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4775565224611882, + "compression/movement_sparsity/importance_threshold": -0.0031108983891278154, + "compression/movement_sparsity/linear_layer_sparsity": 0.13766493270190572, + "compression/movement_sparsity/model_sparsity": 0.1329357187567263, + "compression_loss": 51.4923095703125, + "distillation_loss": 0.5421846508979797, + "epoch": 1.12, + "learning_rate": 4.1993720565149136e-05, + "loss": 52.0206, + "step": 1326, + "task_loss": 0.5908591747283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48049739355434024, + "compression/movement_sparsity/importance_threshold": -0.0031019504606141124, + "compression/movement_sparsity/linear_layer_sparsity": 0.13948147232372074, + "compression/movement_sparsity/model_sparsity": 0.13468985465420244, + "compression_loss": 51.80863952636719, + "distillation_loss": 1.5145933628082275, + "epoch": 1.12, + "learning_rate": 4.198768264702331e-05, + "loss": 52.7521, + "step": 1327, + "task_loss": 0.8470127582550049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4834326199830216, + "compression/movement_sparsity/importance_threshold": -0.003093019706622028, + "compression/movement_sparsity/linear_layer_sparsity": 0.1410719058788245, + "compression/movement_sparsity/model_sparsity": 0.13622565192394248, + "compression_loss": 52.12431335449219, + "distillation_loss": 0.4998517334461212, + "epoch": 1.12, + "learning_rate": 4.198164472889748e-05, + "loss": 52.6055, + "step": 1328, + "task_loss": 0.38575848937034607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48636220716958023, + "compression/movement_sparsity/importance_threshold": -0.003084106110653464, + "compression/movement_sparsity/linear_layer_sparsity": 0.14282931355333328, + "compression/movement_sparsity/model_sparsity": 0.1379226872384135, + "compression_loss": 52.43937301635742, + "distillation_loss": 0.7924885749816895, + "epoch": 1.12, + "learning_rate": 4.197560681077165e-05, + "loss": 53.2151, + "step": 1329, + "task_loss": 0.2703063488006592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.489286160536364, + "compression/movement_sparsity/importance_threshold": -0.003075209656210321, + "compression/movement_sparsity/linear_layer_sparsity": 0.1446762121059492, + "compression/movement_sparsity/model_sparsity": 0.13970613914402252, + "compression_loss": 52.753841400146484, + "distillation_loss": 0.5064437389373779, + "epoch": 1.12, + "learning_rate": 4.196956889264582e-05, + "loss": 53.269, + "step": 1330, + "task_loss": 1.033895492553711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49220448550572093, + "compression/movement_sparsity/importance_threshold": -0.003066330326794501, + "compression/movement_sparsity/linear_layer_sparsity": 0.1465540061769096, + "compression/movement_sparsity/model_sparsity": 0.14151942521187524, + "compression_loss": 53.06763458251953, + "distillation_loss": 0.5594974756240845, + "epoch": 1.13, + "learning_rate": 4.1963530974519986e-05, + "loss": 53.6294, + "step": 1331, + "task_loss": 1.134873628616333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4951171875, + "compression/movement_sparsity/importance_threshold": -0.003057468105907901, + "compression/movement_sparsity/linear_layer_sparsity": 0.14849322163536238, + "compression/movement_sparsity/model_sparsity": 0.1433920226536056, + "compression_loss": 53.380760192871094, + "distillation_loss": 0.6891230344772339, + "epoch": 1.13, + "learning_rate": 4.195749305639416e-05, + "loss": 54.2164, + "step": 1332, + "task_loss": 1.2063493728637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4980242719415489, + "compression/movement_sparsity/importance_threshold": -0.003048622977052424, + "compression/movement_sparsity/linear_layer_sparsity": 0.1505457643830264, + "compression/movement_sparsity/model_sparsity": 0.14537405424352728, + "compression_loss": 53.69331741333008, + "distillation_loss": 0.3187189996242523, + "epoch": 1.13, + "learning_rate": 4.195145513826833e-05, + "loss": 54.1606, + "step": 1333, + "task_loss": 1.4558395147323608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5009257442527161, + "compression/movement_sparsity/importance_threshold": -0.003039794923729969, + "compression/movement_sparsity/linear_layer_sparsity": 0.1525055371064834, + "compression/movement_sparsity/model_sparsity": 0.14726650274496747, + "compression_loss": 54.005271911621094, + "distillation_loss": 0.35164231061935425, + "epoch": 1.13, + "learning_rate": 4.1945417220142494e-05, + "loss": 54.5363, + "step": 1334, + "task_loss": 0.9146254062652588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5038216098558489, + "compression/movement_sparsity/importance_threshold": -0.003030983929442439, + "compression/movement_sparsity/linear_layer_sparsity": 0.15439517187590623, + "compression/movement_sparsity/model_sparsity": 0.14909122274686412, + "compression_loss": 54.31656265258789, + "distillation_loss": 0.684345006942749, + "epoch": 1.13, + "learning_rate": 4.193937930201667e-05, + "loss": 54.9945, + "step": 1335, + "task_loss": 0.17185698449611664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5067118741732968, + "compression/movement_sparsity/importance_threshold": -0.003022189977691731, + "compression/movement_sparsity/linear_layer_sparsity": 0.1564990720135779, + "compression/movement_sparsity/model_sparsity": 0.1511228474424529, + "compression_loss": 54.62724685668945, + "distillation_loss": 0.37548866868019104, + "epoch": 1.13, + "learning_rate": 4.1933341383890835e-05, + "loss": 55.2774, + "step": 1336, + "task_loss": 0.45851272344589233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5095965426274073, + "compression/movement_sparsity/importance_threshold": -0.003013413051979748, + "compression/movement_sparsity/linear_layer_sparsity": 0.15842554053682795, + "compression/movement_sparsity/model_sparsity": 0.15298313584541892, + "compression_loss": 54.93730545043945, + "distillation_loss": 0.4902113378047943, + "epoch": 1.13, + "learning_rate": 4.1927303465765e-05, + "loss": 55.5883, + "step": 1337, + "task_loss": 0.7764456272125244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5124756206405283, + "compression/movement_sparsity/importance_threshold": -0.00300465313580839, + "compression/movement_sparsity/linear_layer_sparsity": 0.16029244784273683, + "compression/movement_sparsity/model_sparsity": 0.1547859091420912, + "compression_loss": 55.24681854248047, + "distillation_loss": 0.8601358532905579, + "epoch": 1.13, + "learning_rate": 4.1921265547639176e-05, + "loss": 55.9548, + "step": 1338, + "task_loss": 1.9680184125900269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5153491136350088, + "compression/movement_sparsity/importance_threshold": -0.0029959102126795563, + "compression/movement_sparsity/linear_layer_sparsity": 0.16236557170374033, + "compression/movement_sparsity/model_sparsity": 0.15678781482079426, + "compression_loss": 55.555747985839844, + "distillation_loss": 1.0419814586639404, + "epoch": 1.13, + "learning_rate": 4.191522762951335e-05, + "loss": 56.1963, + "step": 1339, + "task_loss": 1.3514772653579712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5182170270331966, + "compression/movement_sparsity/importance_threshold": -0.002987184266095148, + "compression/movement_sparsity/linear_layer_sparsity": 0.16425079453113792, + "compression/movement_sparsity/model_sparsity": 0.15860827444444697, + "compression_loss": 55.86402893066406, + "distillation_loss": 0.7991650104522705, + "epoch": 1.13, + "learning_rate": 4.190918971138752e-05, + "loss": 56.3824, + "step": 1340, + "task_loss": 1.3947752714157104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5210793662574396, + "compression/movement_sparsity/importance_threshold": -0.0029784752795570663, + "compression/movement_sparsity/linear_layer_sparsity": 0.16629516922397136, + "compression/movement_sparsity/model_sparsity": 0.1605824185773494, + "compression_loss": 56.17171096801758, + "distillation_loss": 0.41190028190612793, + "epoch": 1.13, + "learning_rate": 4.1903151793261685e-05, + "loss": 56.7545, + "step": 1341, + "task_loss": 0.29174506664276123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5239361367300863, + "compression/movement_sparsity/importance_threshold": -0.0029697832365672104, + "compression/movement_sparsity/linear_layer_sparsity": 0.16845872597232525, + "compression/movement_sparsity/model_sparsity": 0.16267165049551827, + "compression_loss": 56.478721618652344, + "distillation_loss": 0.28209346532821655, + "epoch": 1.13, + "learning_rate": 4.189711387513586e-05, + "loss": 57.1874, + "step": 1342, + "task_loss": 0.03726118057966232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5267873438734849, + "compression/movement_sparsity/importance_threshold": -0.002961108120627482, + "compression/movement_sparsity/linear_layer_sparsity": 0.1705467908153765, + "compression/movement_sparsity/model_sparsity": 0.1646879838875719, + "compression_loss": 56.78512191772461, + "distillation_loss": 0.6510779857635498, + "epoch": 1.14, + "learning_rate": 4.1891075957010026e-05, + "loss": 57.3976, + "step": 1343, + "task_loss": 0.38953909277915955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5296329931099835, + "compression/movement_sparsity/importance_threshold": -0.0029524499152397813, + "compression/movement_sparsity/linear_layer_sparsity": 0.17251473159366404, + "compression/movement_sparsity/model_sparsity": 0.1665883198460313, + "compression_loss": 57.0909309387207, + "distillation_loss": 0.7534204721450806, + "epoch": 1.14, + "learning_rate": 4.188503803888419e-05, + "loss": 57.9751, + "step": 1344, + "task_loss": 0.9102391004562378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5324730898619304, + "compression/movement_sparsity/importance_threshold": -0.0029438086039060077, + "compression/movement_sparsity/linear_layer_sparsity": 0.17450223993104205, + "compression/movement_sparsity/model_sparsity": 0.16850755115772958, + "compression_loss": 57.39614486694336, + "distillation_loss": 0.651016116142273, + "epoch": 1.14, + "learning_rate": 4.187900012075837e-05, + "loss": 58.0904, + "step": 1345, + "task_loss": 1.4844597578048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5353076395516739, + "compression/movement_sparsity/importance_threshold": -0.002935184170128062, + "compression/movement_sparsity/linear_layer_sparsity": 0.17653915009493545, + "compression/movement_sparsity/model_sparsity": 0.17047448719122463, + "compression_loss": 57.70078659057617, + "distillation_loss": 0.8724663257598877, + "epoch": 1.14, + "learning_rate": 4.1872962202632534e-05, + "loss": 58.6078, + "step": 1346, + "task_loss": 0.7334779500961304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5381366476015617, + "compression/movement_sparsity/importance_threshold": -0.0029265765974078473, + "compression/movement_sparsity/linear_layer_sparsity": 0.17854991055920338, + "compression/movement_sparsity/model_sparsity": 0.17241617184772223, + "compression_loss": 58.00483703613281, + "distillation_loss": 0.44548577070236206, + "epoch": 1.14, + "learning_rate": 4.18669242845067e-05, + "loss": 58.6765, + "step": 1347, + "task_loss": 0.40221214294433594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5409601194339426, + "compression/movement_sparsity/importance_threshold": -0.002917985869247259, + "compression/movement_sparsity/linear_layer_sparsity": 0.18062875802067213, + "compression/movement_sparsity/model_sparsity": 0.1744236045036067, + "compression_loss": 58.308311462402344, + "distillation_loss": 0.6168889999389648, + "epoch": 1.14, + "learning_rate": 4.1860886366380875e-05, + "loss": 59.0554, + "step": 1348, + "task_loss": 0.8127673864364624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5437780604711646, + "compression/movement_sparsity/importance_threshold": -0.0029094119691482016, + "compression/movement_sparsity/linear_layer_sparsity": 0.18277366537084352, + "compression/movement_sparsity/model_sparsity": 0.1764948276877929, + "compression_loss": 58.61111831665039, + "distillation_loss": 0.3418865203857422, + "epoch": 1.14, + "learning_rate": 4.185484844825505e-05, + "loss": 59.0452, + "step": 1349, + "task_loss": 0.024086998775601387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5465904761355758, + "compression/movement_sparsity/importance_threshold": -0.0029008548806125738, + "compression/movement_sparsity/linear_layer_sparsity": 0.18469504227451303, + "compression/movement_sparsity/model_sparsity": 0.17835019938397464, + "compression_loss": 58.913307189941406, + "distillation_loss": 0.7119177579879761, + "epoch": 1.14, + "learning_rate": 4.184881053012921e-05, + "loss": 59.7898, + "step": 1350, + "task_loss": 1.1302666664123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.549397371849524, + "compression/movement_sparsity/importance_threshold": -0.002892314587142278, + "compression/movement_sparsity/linear_layer_sparsity": 0.18644762066112927, + "compression/movement_sparsity/model_sparsity": 0.18004257131144885, + "compression_loss": 59.214942932128906, + "distillation_loss": 0.4487518072128296, + "epoch": 1.14, + "learning_rate": 4.1842772612003383e-05, + "loss": 60.0115, + "step": 1351, + "task_loss": 1.062807321548462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5521987530353583, + "compression/movement_sparsity/importance_threshold": -0.002883791072239212, + "compression/movement_sparsity/linear_layer_sparsity": 0.18873137301925266, + "compression/movement_sparsity/model_sparsity": 0.1822478697504267, + "compression_loss": 59.51594924926758, + "distillation_loss": 0.7475280165672302, + "epoch": 1.14, + "learning_rate": 4.183673469387756e-05, + "loss": 60.3057, + "step": 1352, + "task_loss": 1.2702032327651978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5549946251154263, + "compression/movement_sparsity/importance_threshold": -0.002875284319405278, + "compression/movement_sparsity/linear_layer_sparsity": 0.19094346112988453, + "compression/movement_sparsity/model_sparsity": 0.18438396582927935, + "compression_loss": 59.81637954711914, + "distillation_loss": 0.8748681545257568, + "epoch": 1.14, + "learning_rate": 4.183069677575172e-05, + "loss": 60.6006, + "step": 1353, + "task_loss": 0.9828934073448181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5577849935120762, + "compression/movement_sparsity/importance_threshold": -0.0028667943121423755, + "compression/movement_sparsity/linear_layer_sparsity": 0.19306390008806712, + "compression/movement_sparsity/model_sparsity": 0.18643156118601514, + "compression_loss": 60.11622619628906, + "distillation_loss": 0.756619930267334, + "epoch": 1.14, + "learning_rate": 4.182465885762589e-05, + "loss": 60.8456, + "step": 1354, + "task_loss": 0.6199924945831299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5605698636476562, + "compression/movement_sparsity/importance_threshold": -0.0028583210339524065, + "compression/movement_sparsity/linear_layer_sparsity": 0.19529774980463444, + "compression/movement_sparsity/model_sparsity": 0.18858867129269283, + "compression_loss": 60.41544723510742, + "distillation_loss": 0.7377138137817383, + "epoch": 1.15, + "learning_rate": 4.1818620939500066e-05, + "loss": 61.1416, + "step": 1355, + "task_loss": 0.194289892911911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5633492409445147, + "compression/movement_sparsity/importance_threshold": -0.0028498644683372697, + "compression/movement_sparsity/linear_layer_sparsity": 0.19737585796770973, + "compression/movement_sparsity/model_sparsity": 0.190595390047358, + "compression_loss": 60.71408462524414, + "distillation_loss": 0.6436585783958435, + "epoch": 1.15, + "learning_rate": 4.181258302137423e-05, + "loss": 61.5809, + "step": 1356, + "task_loss": 0.6387341618537903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5661231308249999, + "compression/movement_sparsity/importance_threshold": -0.002841424598798866, + "compression/movement_sparsity/linear_layer_sparsity": 0.19943336116911026, + "compression/movement_sparsity/model_sparsity": 0.19258221168417025, + "compression_loss": 61.012107849121094, + "distillation_loss": 0.579427182674408, + "epoch": 1.15, + "learning_rate": 4.18065451032484e-05, + "loss": 61.7557, + "step": 1357, + "task_loss": 0.4528934061527252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5688915387114595, + "compression/movement_sparsity/importance_threshold": -0.002833001408839097, + "compression/movement_sparsity/linear_layer_sparsity": 0.2011861780390792, + "compression/movement_sparsity/model_sparsity": 0.19427481390236034, + "compression_loss": 61.30952072143555, + "distillation_loss": 0.8669440150260925, + "epoch": 1.15, + "learning_rate": 4.1800507185122574e-05, + "loss": 62.0841, + "step": 1358, + "task_loss": 1.4366141557693481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5716544700262425, + "compression/movement_sparsity/importance_threshold": -0.0028245948819598616, + "compression/movement_sparsity/linear_layer_sparsity": 0.2033785316522737, + "compression/movement_sparsity/model_sparsity": 0.19639185342447305, + "compression_loss": 61.60634994506836, + "distillation_loss": 0.8479097485542297, + "epoch": 1.15, + "learning_rate": 4.179446926699674e-05, + "loss": 62.3573, + "step": 1359, + "task_loss": 0.36244797706604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5744119301916966, + "compression/movement_sparsity/importance_threshold": -0.0028162050016630608, + "compression/movement_sparsity/linear_layer_sparsity": 0.20544922298307952, + "compression/movement_sparsity/model_sparsity": 0.198391410137874, + "compression_loss": 61.90259552001953, + "distillation_loss": 1.0806055068969727, + "epoch": 1.15, + "learning_rate": 4.178843134887091e-05, + "loss": 62.7493, + "step": 1360, + "task_loss": 1.2633005380630493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5771639246301696, + "compression/movement_sparsity/importance_threshold": -0.0028078317514505964, + "compression/movement_sparsity/linear_layer_sparsity": 0.20752065361227873, + "compression/movement_sparsity/model_sparsity": 0.20039168075249425, + "compression_loss": 62.19822692871094, + "distillation_loss": 0.8248847723007202, + "epoch": 1.15, + "learning_rate": 4.178239343074508e-05, + "loss": 63.0963, + "step": 1361, + "task_loss": 1.3339134454727173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5799104587640107, + "compression/movement_sparsity/importance_threshold": -0.002799475114824366, + "compression/movement_sparsity/linear_layer_sparsity": 0.2099370623353511, + "compression/movement_sparsity/model_sparsity": 0.20272507840218637, + "compression_loss": 62.493255615234375, + "distillation_loss": 0.9832310676574707, + "epoch": 1.15, + "learning_rate": 4.177635551261925e-05, + "loss": 63.5547, + "step": 1362, + "task_loss": 0.8253710865974426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5826515380155675, + "compression/movement_sparsity/importance_threshold": -0.002791135075286273, + "compression/movement_sparsity/linear_layer_sparsity": 0.2119308547090732, + "compression/movement_sparsity/model_sparsity": 0.20465037787424836, + "compression_loss": 62.78776931762695, + "distillation_loss": 0.9118959307670593, + "epoch": 1.15, + "learning_rate": 4.177031759449342e-05, + "loss": 63.5636, + "step": 1363, + "task_loss": 0.42578306794166565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.585387167807188, + "compression/movement_sparsity/importance_threshold": -0.0027828116163382166, + "compression/movement_sparsity/linear_layer_sparsity": 0.21404406762166847, + "compression/movement_sparsity/model_sparsity": 0.20669099542229266, + "compression_loss": 63.08169937133789, + "distillation_loss": 0.632866382598877, + "epoch": 1.15, + "learning_rate": 4.176427967636759e-05, + "loss": 63.7584, + "step": 1364, + "task_loss": 0.8126760125160217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5881173535612209, + "compression/movement_sparsity/importance_threshold": -0.002774504721482097, + "compression/movement_sparsity/linear_layer_sparsity": 0.21595244718261492, + "compression/movement_sparsity/model_sparsity": 0.20853381627445836, + "compression_loss": 63.37501907348633, + "distillation_loss": 0.6368722915649414, + "epoch": 1.15, + "learning_rate": 4.1758241758241765e-05, + "loss": 64.1358, + "step": 1365, + "task_loss": 1.117645025253296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5908421007000141, + "compression/movement_sparsity/importance_threshold": -0.0027662143742198143, + "compression/movement_sparsity/linear_layer_sparsity": 0.21804513860270885, + "compression/movement_sparsity/model_sparsity": 0.21055461730640027, + "compression_loss": 63.667762756347656, + "distillation_loss": 0.7271023392677307, + "epoch": 1.15, + "learning_rate": 4.1752203840115925e-05, + "loss": 64.5768, + "step": 1366, + "task_loss": 0.9656323194503784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5935614146459156, + "compression/movement_sparsity/importance_threshold": -0.0027579405580532707, + "compression/movement_sparsity/linear_layer_sparsity": 0.2199053802989093, + "compression/movement_sparsity/model_sparsity": 0.2123509539775634, + "compression_loss": 63.9599494934082, + "distillation_loss": 1.3375957012176514, + "epoch": 1.16, + "learning_rate": 4.17461659219901e-05, + "loss": 64.8956, + "step": 1367, + "task_loss": 1.154859185218811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.596275300821274, + "compression/movement_sparsity/importance_threshold": -0.0027496832564843643, + "compression/movement_sparsity/linear_layer_sparsity": 0.22189633472073408, + "compression/movement_sparsity/model_sparsity": 0.21427351299010627, + "compression_loss": 64.25151824951172, + "distillation_loss": 0.8366067409515381, + "epoch": 1.16, + "learning_rate": 4.174012800386427e-05, + "loss": 65.0679, + "step": 1368, + "task_loss": 0.6697847843170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5989837646484375, + "compression/movement_sparsity/importance_threshold": -0.0027414424530149972, + "compression/movement_sparsity/linear_layer_sparsity": 0.22413169880659115, + "compression/movement_sparsity/model_sparsity": 0.21643208544282988, + "compression_loss": 64.54248046875, + "distillation_loss": 0.4663401246070862, + "epoch": 1.16, + "learning_rate": 4.173409008573844e-05, + "loss": 65.3785, + "step": 1369, + "task_loss": 0.5330495834350586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6016868115497541, + "compression/movement_sparsity/importance_threshold": -0.0027332181311470693, + "compression/movement_sparsity/linear_layer_sparsity": 0.22613675951872914, + "compression/movement_sparsity/model_sparsity": 0.2183682661512177, + "compression_loss": 64.83292388916016, + "distillation_loss": 0.6931779384613037, + "epoch": 1.16, + "learning_rate": 4.172805216761261e-05, + "loss": 65.659, + "step": 1370, + "task_loss": 0.9911800622940063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6043844469475719, + "compression/movement_sparsity/importance_threshold": -0.0027250102743824815, + "compression/movement_sparsity/linear_layer_sparsity": 0.22821125465901076, + "compression/movement_sparsity/model_sparsity": 0.22037149600153713, + "compression_loss": 65.12277221679688, + "distillation_loss": 1.086794376373291, + "epoch": 1.16, + "learning_rate": 4.172201424948678e-05, + "loss": 65.8935, + "step": 1371, + "task_loss": 1.3339197635650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.607076676264239, + "compression/movement_sparsity/importance_threshold": -0.0027168188662231346, + "compression/movement_sparsity/linear_layer_sparsity": 0.2302616152839974, + "compression/movement_sparsity/model_sparsity": 0.22235142043140843, + "compression_loss": 65.41204071044922, + "distillation_loss": 0.8879855275154114, + "epoch": 1.16, + "learning_rate": 4.171597633136095e-05, + "loss": 66.1692, + "step": 1372, + "task_loss": 0.9749023914337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6097635049221041, + "compression/movement_sparsity/importance_threshold": -0.0027086438901709276, + "compression/movement_sparsity/linear_layer_sparsity": 0.23222244925837401, + "compression/movement_sparsity/model_sparsity": 0.2242448937265343, + "compression_loss": 65.70072937011719, + "distillation_loss": 0.7153295278549194, + "epoch": 1.16, + "learning_rate": 4.1709938413235116e-05, + "loss": 66.5496, + "step": 1373, + "task_loss": 0.6534342765808105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6124449383435151, + "compression/movement_sparsity/importance_threshold": -0.0027004853297277623, + "compression/movement_sparsity/linear_layer_sparsity": 0.23412299464118366, + "compression/movement_sparsity/model_sparsity": 0.226080149528683, + "compression_loss": 65.98876953125, + "distillation_loss": 0.7891640663146973, + "epoch": 1.16, + "learning_rate": 4.170390049510929e-05, + "loss": 66.6855, + "step": 1374, + "task_loss": 0.8584902882575989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6151209819508199, + "compression/movement_sparsity/importance_threshold": -0.002692343168395539, + "compression/movement_sparsity/linear_layer_sparsity": 0.2361486126183259, + "compression/movement_sparsity/model_sparsity": 0.2280361812967806, + "compression_loss": 66.27623748779297, + "distillation_loss": 0.5732203125953674, + "epoch": 1.16, + "learning_rate": 4.169786257698346e-05, + "loss": 66.9091, + "step": 1375, + "task_loss": 0.5100113153457642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6177916411663673, + "compression/movement_sparsity/importance_threshold": -0.002684217389676157, + "compression/movement_sparsity/linear_layer_sparsity": 0.2381610543902305, + "compression/movement_sparsity/model_sparsity": 0.22997948950282526, + "compression_loss": 66.56306457519531, + "distillation_loss": 0.5590233206748962, + "epoch": 1.16, + "learning_rate": 4.1691824658857624e-05, + "loss": 67.377, + "step": 1376, + "task_loss": 1.5021125078201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6204569214125051, + "compression/movement_sparsity/importance_threshold": -0.002676107977071518, + "compression/movement_sparsity/linear_layer_sparsity": 0.24036286386836025, + "compression/movement_sparsity/model_sparsity": 0.232105660051823, + "compression_loss": 66.84934997558594, + "distillation_loss": 0.6142886281013489, + "epoch": 1.16, + "learning_rate": 4.16857867407318e-05, + "loss": 67.6273, + "step": 1377, + "task_loss": 0.8872668147087097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6231168281115813, + "compression/movement_sparsity/importance_threshold": -0.0026680149140835222, + "compression/movement_sparsity/linear_layer_sparsity": 0.24237684385788988, + "compression/movement_sparsity/model_sparsity": 0.23405045363298516, + "compression_loss": 67.13507080078125, + "distillation_loss": 1.406994342803955, + "epoch": 1.16, + "learning_rate": 4.1679748822605965e-05, + "loss": 68.2786, + "step": 1378, + "task_loss": 0.8127667903900146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6257713666859449, + "compression/movement_sparsity/importance_threshold": -0.002659938184214069, + "compression/movement_sparsity/linear_layer_sparsity": 0.2444149345143792, + "compression/movement_sparsity/model_sparsity": 0.23601852960552386, + "compression_loss": 67.42024230957031, + "distillation_loss": 0.8874384760856628, + "epoch": 1.17, + "learning_rate": 4.167371090448014e-05, + "loss": 68.2482, + "step": 1379, + "task_loss": 0.263242244720459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6284205425579432, + "compression/movement_sparsity/importance_threshold": -0.0026518777709650613, + "compression/movement_sparsity/linear_layer_sparsity": 0.24646090704567586, + "compression/movement_sparsity/model_sparsity": 0.23799421668622275, + "compression_loss": 67.7048110961914, + "distillation_loss": 0.6260169744491577, + "epoch": 1.17, + "learning_rate": 4.1667672986354306e-05, + "loss": 68.6636, + "step": 1380, + "task_loss": 0.3249625861644745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6310643611499249, + "compression/movement_sparsity/importance_threshold": -0.0026438336578383974, + "compression/movement_sparsity/linear_layer_sparsity": 0.24854806565198678, + "compression/movement_sparsity/model_sparsity": 0.240009674973556, + "compression_loss": 67.98883819580078, + "distillation_loss": 1.2066599130630493, + "epoch": 1.17, + "learning_rate": 4.166163506822848e-05, + "loss": 69.0842, + "step": 1381, + "task_loss": 1.6980273723602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.633702827884238, + "compression/movement_sparsity/importance_threshold": -0.0026358058283359777, + "compression/movement_sparsity/linear_layer_sparsity": 0.2504901548348398, + "compression/movement_sparsity/model_sparsity": 0.24188504741841285, + "compression_loss": 68.27230834960938, + "distillation_loss": 1.112046480178833, + "epoch": 1.17, + "learning_rate": 4.165559715010265e-05, + "loss": 69.295, + "step": 1382, + "task_loss": 0.630577564239502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.636335948183231, + "compression/movement_sparsity/importance_threshold": -0.0026277942659597036, + "compression/movement_sparsity/linear_layer_sparsity": 0.25255428787344586, + "compression/movement_sparsity/model_sparsity": 0.24387827113712685, + "compression_loss": 68.55519104003906, + "distillation_loss": 1.0415713787078857, + "epoch": 1.17, + "learning_rate": 4.1649559231976815e-05, + "loss": 69.6207, + "step": 1383, + "task_loss": 1.6775456666946411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6389637274692515, + "compression/movement_sparsity/importance_threshold": -0.0026197989542114768, + "compression/movement_sparsity/linear_layer_sparsity": 0.25465263134899924, + "compression/movement_sparsity/model_sparsity": 0.24590453005903537, + "compression_loss": 68.83745574951172, + "distillation_loss": 1.1353254318237305, + "epoch": 1.17, + "learning_rate": 4.164352131385099e-05, + "loss": 69.6061, + "step": 1384, + "task_loss": 0.8452408313751221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6415861711646482, + "compression/movement_sparsity/importance_threshold": -0.0026118198765931943, + "compression/movement_sparsity/linear_layer_sparsity": 0.2566804314488188, + "compression/movement_sparsity/model_sparsity": 0.2478626689871834, + "compression_loss": 69.11917114257812, + "distillation_loss": 1.077697992324829, + "epoch": 1.17, + "learning_rate": 4.1637483395725156e-05, + "loss": 70.2182, + "step": 1385, + "task_loss": 1.9962106943130493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6442032846917692, + "compression/movement_sparsity/importance_threshold": -0.0026038570166067596, + "compression/movement_sparsity/linear_layer_sparsity": 0.2585530027343547, + "compression/movement_sparsity/model_sparsity": 0.2496709116883581, + "compression_loss": 69.40036010742188, + "distillation_loss": 0.6430842280387878, + "epoch": 1.17, + "learning_rate": 4.163144547759932e-05, + "loss": 70.3305, + "step": 1386, + "task_loss": 0.7763282060623169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6468150734729627, + "compression/movement_sparsity/importance_threshold": -0.0025959103577540715, + "compression/movement_sparsity/linear_layer_sparsity": 0.2606312659116093, + "compression/movement_sparsity/model_sparsity": 0.2516777801319886, + "compression_loss": 69.68089294433594, + "distillation_loss": 0.2612699866294861, + "epoch": 1.17, + "learning_rate": 4.16254075594735e-05, + "loss": 70.4575, + "step": 1387, + "task_loss": 0.2202429622411728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6494215429305765, + "compression/movement_sparsity/importance_threshold": -0.002587979883537032, + "compression/movement_sparsity/linear_layer_sparsity": 0.26289143226614897, + "compression/movement_sparsity/model_sparsity": 0.25386030281916483, + "compression_loss": 69.9608154296875, + "distillation_loss": 0.889277994632721, + "epoch": 1.17, + "learning_rate": 4.1619369641347664e-05, + "loss": 70.9411, + "step": 1388, + "task_loss": 0.849382758140564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6520226984869595, + "compression/movement_sparsity/importance_threshold": -0.00258006557745754, + "compression/movement_sparsity/linear_layer_sparsity": 0.264844098185695, + "compression/movement_sparsity/model_sparsity": 0.25574588865727144, + "compression_loss": 70.24024963378906, + "distillation_loss": 0.6564585566520691, + "epoch": 1.17, + "learning_rate": 4.161333172322184e-05, + "loss": 71.1757, + "step": 1389, + "task_loss": 0.9101492762565613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6546185455644595, + "compression/movement_sparsity/importance_threshold": -0.0025721674230174962, + "compression/movement_sparsity/linear_layer_sparsity": 0.2669554390039714, + "compression/movement_sparsity/model_sparsity": 0.257784698423196, + "compression_loss": 70.51902770996094, + "distillation_loss": 0.8258087635040283, + "epoch": 1.17, + "learning_rate": 4.1607293805096005e-05, + "loss": 71.4319, + "step": 1390, + "task_loss": 0.5144898891448975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6572090895854248, + "compression/movement_sparsity/importance_threshold": -0.0025642854037188014, + "compression/movement_sparsity/linear_layer_sparsity": 0.26895404874141843, + "compression/movement_sparsity/model_sparsity": 0.259714649767719, + "compression_loss": 70.79725646972656, + "distillation_loss": 0.8017131090164185, + "epoch": 1.18, + "learning_rate": 4.160125588697017e-05, + "loss": 71.6644, + "step": 1391, + "task_loss": 1.0256181955337524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6597943359722029, + "compression/movement_sparsity/importance_threshold": -0.0025564195030633573, + "compression/movement_sparsity/linear_layer_sparsity": 0.271268446210366, + "compression/movement_sparsity/model_sparsity": 0.2619495405636888, + "compression_loss": 71.0748291015625, + "distillation_loss": 1.2390258312225342, + "epoch": 1.18, + "learning_rate": 4.1595217968844346e-05, + "loss": 72.124, + "step": 1392, + "task_loss": 1.5512627363204956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6623742901471432, + "compression/movement_sparsity/importance_threshold": -0.0025485697045530623, + "compression/movement_sparsity/linear_layer_sparsity": 0.2734800931267953, + "compression/movement_sparsity/model_sparsity": 0.26408521060471707, + "compression_loss": 71.35183715820312, + "distillation_loss": 0.7582095861434937, + "epoch": 1.18, + "learning_rate": 4.1589180050718514e-05, + "loss": 72.4596, + "step": 1393, + "task_loss": 0.7546876668930054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6649489575325933, + "compression/movement_sparsity/importance_threshold": -0.002540735991689817, + "compression/movement_sparsity/linear_layer_sparsity": 0.27561110882167095, + "compression/movement_sparsity/model_sparsity": 0.26614301935470264, + "compression_loss": 71.62828826904297, + "distillation_loss": 1.2509907484054565, + "epoch": 1.18, + "learning_rate": 4.158314213259268e-05, + "loss": 72.6355, + "step": 1394, + "task_loss": 0.7522532939910889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.667518343550901, + "compression/movement_sparsity/importance_threshold": -0.0025329183479755246, + "compression/movement_sparsity/linear_layer_sparsity": 0.27791360597131787, + "compression/movement_sparsity/model_sparsity": 0.26836641864394944, + "compression_loss": 71.90416717529297, + "distillation_loss": 0.6605136394500732, + "epoch": 1.18, + "learning_rate": 4.1577104214466855e-05, + "loss": 72.7973, + "step": 1395, + "task_loss": 0.8413481116294861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6700824536244152, + "compression/movement_sparsity/importance_threshold": -0.0025251167569120825, + "compression/movement_sparsity/linear_layer_sparsity": 0.27994866788922773, + "compression/movement_sparsity/model_sparsity": 0.2703315699243964, + "compression_loss": 72.17947387695312, + "distillation_loss": 2.018920421600342, + "epoch": 1.18, + "learning_rate": 4.157106629634102e-05, + "loss": 73.5889, + "step": 1396, + "task_loss": 1.2018566131591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6726412931754838, + "compression/movement_sparsity/importance_threshold": -0.0025173312020013916, + "compression/movement_sparsity/linear_layer_sparsity": 0.2822247172350584, + "compression/movement_sparsity/model_sparsity": 0.2725294299732509, + "compression_loss": 72.45419311523438, + "distillation_loss": 1.0406205654144287, + "epoch": 1.18, + "learning_rate": 4.1565028378215196e-05, + "loss": 73.4745, + "step": 1397, + "task_loss": 1.533910870552063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6751948676264545, + "compression/movement_sparsity/importance_threshold": -0.0025095616667453547, + "compression/movement_sparsity/linear_layer_sparsity": 0.2842650496350632, + "compression/movement_sparsity/model_sparsity": 0.274499670678519, + "compression_loss": 72.72845458984375, + "distillation_loss": 0.6194800138473511, + "epoch": 1.18, + "learning_rate": 4.155899046008936e-05, + "loss": 73.5565, + "step": 1398, + "task_loss": 0.40264472365379333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6777431823996767, + "compression/movement_sparsity/importance_threshold": -0.0025018081346458686, + "compression/movement_sparsity/linear_layer_sparsity": 0.28627910116959865, + "compression/movement_sparsity/model_sparsity": 0.27644453334689595, + "compression_loss": 73.00200653076172, + "distillation_loss": 2.0512421131134033, + "epoch": 1.18, + "learning_rate": 4.155295254196354e-05, + "loss": 74.4878, + "step": 1399, + "task_loss": 1.1340610980987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6802862429174975, + "compression/movement_sparsity/importance_threshold": -0.0024940705892048365, + "compression/movement_sparsity/linear_layer_sparsity": 0.2884824369411858, + "compression/movement_sparsity/model_sparsity": 0.27857217775647536, + "compression_loss": 73.27507781982422, + "distillation_loss": 1.5700030326843262, + "epoch": 1.18, + "learning_rate": 4.1546914623837704e-05, + "loss": 74.572, + "step": 1400, + "task_loss": 1.5543832778930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6828240546022653, + "compression/movement_sparsity/importance_threshold": -0.002486349013924159, + "compression/movement_sparsity/linear_layer_sparsity": 0.29054994451923283, + "compression/movement_sparsity/model_sparsity": 0.2805686600888192, + "compression_loss": 73.5475845336914, + "distillation_loss": 0.9511449337005615, + "epoch": 1.18, + "learning_rate": 4.154087670571187e-05, + "loss": 74.491, + "step": 1401, + "task_loss": 0.8736863136291504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6853566228763288, + "compression/movement_sparsity/importance_threshold": -0.0024786433923057343, + "compression/movement_sparsity/linear_layer_sparsity": 0.29287848405099653, + "compression/movement_sparsity/model_sparsity": 0.28281720712424135, + "compression_loss": 73.81948852539062, + "distillation_loss": 1.6658793687820435, + "epoch": 1.19, + "learning_rate": 4.1534838787586045e-05, + "loss": 74.9614, + "step": 1402, + "task_loss": 1.5326954126358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.687883953162036, + "compression/movement_sparsity/importance_threshold": -0.0024709537078514638, + "compression/movement_sparsity/linear_layer_sparsity": 0.2951152313402993, + "compression/movement_sparsity/model_sparsity": 0.2849771152631171, + "compression_loss": 74.09085083007812, + "distillation_loss": 0.8680563569068909, + "epoch": 1.19, + "learning_rate": 4.152880086946021e-05, + "loss": 74.8887, + "step": 1403, + "task_loss": 1.6420655250549316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6904060508817345, + "compression/movement_sparsity/importance_threshold": -0.00246327994406325, + "compression/movement_sparsity/linear_layer_sparsity": 0.29723178301983266, + "compression/movement_sparsity/model_sparsity": 0.2870209568811839, + "compression_loss": 74.36160278320312, + "distillation_loss": 1.452422857284546, + "epoch": 1.19, + "learning_rate": 4.152276295133438e-05, + "loss": 75.4644, + "step": 1404, + "task_loss": 1.1320195198059082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6929229214577731, + "compression/movement_sparsity/importance_threshold": -0.0024556220844429905, + "compression/movement_sparsity/linear_layer_sparsity": 0.2993860389174306, + "compression/movement_sparsity/model_sparsity": 0.28910120746143303, + "compression_loss": 74.63177490234375, + "distillation_loss": 0.9171569347381592, + "epoch": 1.19, + "learning_rate": 4.1516725033208554e-05, + "loss": 75.7188, + "step": 1405, + "task_loss": 0.7533453106880188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6954345703125, + "compression/movement_sparsity/importance_threshold": -0.002447980112492587, + "compression/movement_sparsity/linear_layer_sparsity": 0.3015433831744462, + "compression/movement_sparsity/model_sparsity": 0.2911844403064529, + "compression_loss": 74.90142059326172, + "distillation_loss": 0.7895047664642334, + "epoch": 1.19, + "learning_rate": 4.151068711508272e-05, + "loss": 75.871, + "step": 1406, + "task_loss": 1.1812800168991089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6979410028682633, + "compression/movement_sparsity/importance_threshold": -0.0024403540117139398, + "compression/movement_sparsity/linear_layer_sparsity": 0.3038659844707272, + "compression/movement_sparsity/model_sparsity": 0.29342725310304935, + "compression_loss": 75.1703872680664, + "distillation_loss": 0.8484163284301758, + "epoch": 1.19, + "learning_rate": 4.150464919695689e-05, + "loss": 76.3005, + "step": 1407, + "task_loss": 1.1036725044250488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.700442224547411, + "compression/movement_sparsity/importance_threshold": -0.0024327437656089497, + "compression/movement_sparsity/linear_layer_sparsity": 0.30607543734031156, + "compression/movement_sparsity/model_sparsity": 0.2955608044694914, + "compression_loss": 75.4388656616211, + "distillation_loss": 1.0059813261032104, + "epoch": 1.19, + "learning_rate": 4.149861127883106e-05, + "loss": 76.683, + "step": 1408, + "task_loss": 1.337117314338684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7029382407722911, + "compression/movement_sparsity/importance_threshold": -0.002425149357679518, + "compression/movement_sparsity/linear_layer_sparsity": 0.30832575433238396, + "compression/movement_sparsity/model_sparsity": 0.29773381615010136, + "compression_loss": 75.70683288574219, + "distillation_loss": 0.8014774918556213, + "epoch": 1.19, + "learning_rate": 4.1492573360705236e-05, + "loss": 76.7041, + "step": 1409, + "task_loss": 0.8850807547569275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7054290569652526, + "compression/movement_sparsity/importance_threshold": -0.002417570771427543, + "compression/movement_sparsity/linear_layer_sparsity": 0.3104834324660934, + "compression/movement_sparsity/model_sparsity": 0.2998173714021235, + "compression_loss": 75.97415161132812, + "distillation_loss": 0.9386861324310303, + "epoch": 1.19, + "learning_rate": 4.1486535442579396e-05, + "loss": 77.2367, + "step": 1410, + "task_loss": 1.493424654006958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7079146785486432, + "compression/movement_sparsity/importance_threshold": -0.002410007990354926, + "compression/movement_sparsity/linear_layer_sparsity": 0.31251996105662244, + "compression/movement_sparsity/model_sparsity": 0.3017839389704732, + "compression_loss": 76.24102783203125, + "distillation_loss": 1.3480424880981445, + "epoch": 1.19, + "learning_rate": 4.148049752445357e-05, + "loss": 77.3146, + "step": 1411, + "task_loss": 1.812322974205017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7103951109448107, + "compression/movement_sparsity/importance_threshold": -0.0024024609979635693, + "compression/movement_sparsity/linear_layer_sparsity": 0.31477026612452724, + "compression/movement_sparsity/model_sparsity": 0.3039569391365473, + "compression_loss": 76.50728607177734, + "distillation_loss": 1.0416315793991089, + "epoch": 1.19, + "learning_rate": 4.1474459606327744e-05, + "loss": 77.9003, + "step": 1412, + "task_loss": 1.787137746810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7128703595761041, + "compression/movement_sparsity/importance_threshold": -0.00239492977775537, + "compression/movement_sparsity/linear_layer_sparsity": 0.3169212786485282, + "compression/movement_sparsity/model_sparsity": 0.3060340577630603, + "compression_loss": 76.77301025390625, + "distillation_loss": 0.9747606515884399, + "epoch": 1.19, + "learning_rate": 4.146842168820191e-05, + "loss": 77.9852, + "step": 1413, + "task_loss": 1.4809799194335938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7153404298648711, + "compression/movement_sparsity/importance_threshold": -0.002387414313232231, + "compression/movement_sparsity/linear_layer_sparsity": 0.3189390147508631, + "compression/movement_sparsity/model_sparsity": 0.30798247842299775, + "compression_loss": 77.03817749023438, + "distillation_loss": 0.716783344745636, + "epoch": 1.2, + "learning_rate": 4.146238377007608e-05, + "loss": 77.8551, + "step": 1414, + "task_loss": 0.7942771911621094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7178053272334597, + "compression/movement_sparsity/importance_threshold": -0.0023799145878960534, + "compression/movement_sparsity/linear_layer_sparsity": 0.3210096941575013, + "compression/movement_sparsity/model_sparsity": 0.30998202362186295, + "compression_loss": 77.30272674560547, + "distillation_loss": 0.9607566595077515, + "epoch": 1.2, + "learning_rate": 4.145634585195025e-05, + "loss": 78.2777, + "step": 1415, + "task_loss": 0.6636053323745728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7202650571042188, + "compression/movement_sparsity/importance_threshold": -0.0023724305852487345, + "compression/movement_sparsity/linear_layer_sparsity": 0.3232173941744431, + "compression/movement_sparsity/model_sparsity": 0.3121138823515432, + "compression_loss": 77.56659698486328, + "distillation_loss": 1.8384191989898682, + "epoch": 1.2, + "learning_rate": 4.145030793382442e-05, + "loss": 79.1871, + "step": 1416, + "task_loss": 1.33530592918396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7227196248994963, + "compression/movement_sparsity/importance_threshold": -0.0023649622887921768, + "compression/movement_sparsity/linear_layer_sparsity": 0.3252054271751971, + "compression/movement_sparsity/model_sparsity": 0.3140336203028164, + "compression_loss": 77.83009338378906, + "distillation_loss": 1.251970887184143, + "epoch": 1.2, + "learning_rate": 4.144427001569859e-05, + "loss": 79.0636, + "step": 1417, + "task_loss": 1.932105302810669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7251690360416397, + "compression/movement_sparsity/importance_threshold": -0.002357509682028282, + "compression/movement_sparsity/linear_layer_sparsity": 0.32744786229246226, + "compression/movement_sparsity/model_sparsity": 0.31619902087526613, + "compression_loss": 78.09297180175781, + "distillation_loss": 1.906305193901062, + "epoch": 1.2, + "learning_rate": 4.143823209757276e-05, + "loss": 79.7876, + "step": 1418, + "task_loss": 1.1763521432876587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7276132959529984, + "compression/movement_sparsity/importance_threshold": -0.002350072748458948, + "compression/movement_sparsity/linear_layer_sparsity": 0.3294561544540296, + "compression/movement_sparsity/model_sparsity": 0.3181383220228543, + "compression_loss": 78.35538482666016, + "distillation_loss": 0.9107580184936523, + "epoch": 1.2, + "learning_rate": 4.143219417944693e-05, + "loss": 79.6057, + "step": 1419, + "task_loss": 1.141257643699646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7300524100559197, + "compression/movement_sparsity/importance_threshold": -0.002342651471586077, + "compression/movement_sparsity/linear_layer_sparsity": 0.33159490893370086, + "compression/movement_sparsity/model_sparsity": 0.3202036037065705, + "compression_loss": 78.61719512939453, + "distillation_loss": 1.711940884590149, + "epoch": 1.2, + "learning_rate": 4.1426156261321095e-05, + "loss": 79.7711, + "step": 1420, + "task_loss": 1.276822566986084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7324863837727519, + "compression/movement_sparsity/importance_threshold": -0.002335245834911569, + "compression/movement_sparsity/linear_layer_sparsity": 0.33356784593822786, + "compression/movement_sparsity/model_sparsity": 0.32210876425552787, + "compression_loss": 78.87846374511719, + "distillation_loss": 0.9947202205657959, + "epoch": 1.2, + "learning_rate": 4.142011834319527e-05, + "loss": 79.9571, + "step": 1421, + "task_loss": 0.8035528659820557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7349152225258437, + "compression/movement_sparsity/importance_threshold": -0.0023278558219373235, + "compression/movement_sparsity/linear_layer_sparsity": 0.3355727993328571, + "compression/movement_sparsity/model_sparsity": 0.3240448413330935, + "compression_loss": 79.13907623291016, + "distillation_loss": 1.3028619289398193, + "epoch": 1.2, + "learning_rate": 4.141408042506944e-05, + "loss": 80.3732, + "step": 1422, + "task_loss": 1.2800740003585815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7373389317375428, + "compression/movement_sparsity/importance_threshold": -0.002320481416165242, + "compression/movement_sparsity/linear_layer_sparsity": 0.3376931905943692, + "compression/movement_sparsity/model_sparsity": 0.32609239063168616, + "compression_loss": 79.39927673339844, + "distillation_loss": 1.0575711727142334, + "epoch": 1.2, + "learning_rate": 4.1408042506943604e-05, + "loss": 80.5298, + "step": 1423, + "task_loss": 0.9868329763412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7397575168301977, + "compression/movement_sparsity/importance_threshold": -0.002313122601097225, + "compression/movement_sparsity/linear_layer_sparsity": 0.33998081830697424, + "compression/movement_sparsity/model_sparsity": 0.3283014312947972, + "compression_loss": 79.65879821777344, + "distillation_loss": 2.0688130855560303, + "epoch": 1.2, + "learning_rate": 4.140200458881778e-05, + "loss": 81.0386, + "step": 1424, + "task_loss": 1.3961021900177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7421709832261562, + "compression/movement_sparsity/importance_threshold": -0.0023057793602351733, + "compression/movement_sparsity/linear_layer_sparsity": 0.34222296724421614, + "compression/movement_sparsity/model_sparsity": 0.33046655551838794, + "compression_loss": 79.9178695678711, + "distillation_loss": 2.3836803436279297, + "epoch": 1.2, + "learning_rate": 4.139596667069195e-05, + "loss": 81.5482, + "step": 1425, + "task_loss": 1.466278314590454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.744579336347767, + "compression/movement_sparsity/importance_threshold": -0.0022984516770809854, + "compression/movement_sparsity/linear_layer_sparsity": 0.34427266011581514, + "compression/movement_sparsity/model_sparsity": 0.3324458351342547, + "compression_loss": 80.17637634277344, + "distillation_loss": 0.723200798034668, + "epoch": 1.21, + "learning_rate": 4.138992875256611e-05, + "loss": 81.2356, + "step": 1426, + "task_loss": 0.7641627788543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7469825816173778, + "compression/movement_sparsity/importance_threshold": -0.002291139535136564, + "compression/movement_sparsity/linear_layer_sparsity": 0.346427345283448, + "compression/movement_sparsity/model_sparsity": 0.33452650023779246, + "compression_loss": 80.43431854248047, + "distillation_loss": 1.9269773960113525, + "epoch": 1.21, + "learning_rate": 4.1383890834440286e-05, + "loss": 81.867, + "step": 1427, + "task_loss": 1.6321303844451904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7493807244573374, + "compression/movement_sparsity/importance_threshold": -0.0022838429179038084, + "compression/movement_sparsity/linear_layer_sparsity": 0.34862209565433727, + "compression/movement_sparsity/model_sparsity": 0.3366458541815998, + "compression_loss": 80.69171142578125, + "distillation_loss": 1.542504072189331, + "epoch": 1.21, + "learning_rate": 4.137785291631446e-05, + "loss": 81.9669, + "step": 1428, + "task_loss": 0.8136503100395203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7517737702899931, + "compression/movement_sparsity/importance_threshold": -0.0022765618088846206, + "compression/movement_sparsity/linear_layer_sparsity": 0.3507870952269751, + "compression/movement_sparsity/model_sparsity": 0.33873647935859985, + "compression_loss": 80.94857788085938, + "distillation_loss": 1.6139310598373413, + "epoch": 1.21, + "learning_rate": 4.137181499818863e-05, + "loss": 82.1961, + "step": 1429, + "task_loss": 1.1871012449264526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7541617245376939, + "compression/movement_sparsity/importance_threshold": -0.002269296191580899, + "compression/movement_sparsity/linear_layer_sparsity": 0.35298265644126364, + "compression/movement_sparsity/model_sparsity": 0.34085661629084124, + "compression_loss": 81.20484924316406, + "distillation_loss": 2.0173540115356445, + "epoch": 1.21, + "learning_rate": 4.1365777080062794e-05, + "loss": 82.7368, + "step": 1430, + "task_loss": 0.774440586566925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.756544592622788, + "compression/movement_sparsity/importance_threshold": -0.002262046049494544, + "compression/movement_sparsity/linear_layer_sparsity": 0.3549282513294016, + "compression/movement_sparsity/model_sparsity": 0.3427353740092217, + "compression_loss": 81.46064758300781, + "distillation_loss": 1.1124191284179688, + "epoch": 1.21, + "learning_rate": 4.135973916193697e-05, + "loss": 82.6803, + "step": 1431, + "task_loss": 1.066011905670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7589223799676228, + "compression/movement_sparsity/importance_threshold": -0.0022548113661274584, + "compression/movement_sparsity/linear_layer_sparsity": 0.3571622918326511, + "compression/movement_sparsity/model_sparsity": 0.3448926683484721, + "compression_loss": 81.71576690673828, + "distillation_loss": 1.8398789167404175, + "epoch": 1.21, + "learning_rate": 4.1353701243811135e-05, + "loss": 83.3577, + "step": 1432, + "task_loss": 1.8267942667007446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7612950919945474, + "compression/movement_sparsity/importance_threshold": -0.0022475921249815404, + "compression/movement_sparsity/linear_layer_sparsity": 0.35917944365077187, + "compression/movement_sparsity/model_sparsity": 0.34684052479615557, + "compression_loss": 81.9704360961914, + "distillation_loss": 1.6999833583831787, + "epoch": 1.21, + "learning_rate": 4.13476633256853e-05, + "loss": 83.3603, + "step": 1433, + "task_loss": 1.4439703226089478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7636627341259096, + "compression/movement_sparsity/importance_threshold": -0.002240388309558691, + "compression/movement_sparsity/linear_layer_sparsity": 0.3610948823189587, + "compression/movement_sparsity/model_sparsity": 0.3486901622535116, + "compression_loss": 82.22450256347656, + "distillation_loss": 1.1816179752349854, + "epoch": 1.21, + "learning_rate": 4.1341625407559477e-05, + "loss": 83.564, + "step": 1434, + "task_loss": 0.3605434000492096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7660253117840572, + "compression/movement_sparsity/importance_threshold": -0.0022331999033608123, + "compression/movement_sparsity/linear_layer_sparsity": 0.36294585893690606, + "compression/movement_sparsity/model_sparsity": 0.3504775521303624, + "compression_loss": 82.47810363769531, + "distillation_loss": 1.076633334159851, + "epoch": 1.21, + "learning_rate": 4.1335587489433644e-05, + "loss": 83.4269, + "step": 1435, + "task_loss": 0.5194324254989624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7683828303913391, + "compression/movement_sparsity/importance_threshold": -0.002226026889889802, + "compression/movement_sparsity/linear_layer_sparsity": 0.3652759486104624, + "compression/movement_sparsity/model_sparsity": 0.35272759605543785, + "compression_loss": 82.73101806640625, + "distillation_loss": 1.6702191829681396, + "epoch": 1.21, + "learning_rate": 4.132954957130781e-05, + "loss": 83.8605, + "step": 1436, + "task_loss": 0.9430710673332214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7707352953701033, + "compression/movement_sparsity/importance_threshold": -0.002218869252647562, + "compression/movement_sparsity/linear_layer_sparsity": 0.36739728188121773, + "compression/movement_sparsity/model_sparsity": 0.35477605500235826, + "compression_loss": 82.9834213256836, + "distillation_loss": 1.8774333000183105, + "epoch": 1.21, + "learning_rate": 4.1323511653181985e-05, + "loss": 84.3656, + "step": 1437, + "task_loss": 1.7696537971496582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7730827121426974, + "compression/movement_sparsity/importance_threshold": -0.0022117269751359943, + "compression/movement_sparsity/linear_layer_sparsity": 0.3694472728570077, + "compression/movement_sparsity/model_sparsity": 0.3567556224816199, + "compression_loss": 83.23524475097656, + "distillation_loss": 1.9800899028778076, + "epoch": 1.22, + "learning_rate": 4.131747373505616e-05, + "loss": 84.7307, + "step": 1438, + "task_loss": 1.8810187578201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7754250861314707, + "compression/movement_sparsity/importance_threshold": -0.002204600040856996, + "compression/movement_sparsity/linear_layer_sparsity": 0.3714385134588557, + "compression/movement_sparsity/model_sparsity": 0.3586784578430218, + "compression_loss": 83.48660278320312, + "distillation_loss": 1.5159274339675903, + "epoch": 1.22, + "learning_rate": 4.131143581693032e-05, + "loss": 85.0444, + "step": 1439, + "task_loss": 1.4477652311325073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7777624227587706, + "compression/movement_sparsity/importance_threshold": -0.0021974884333124697, + "compression/movement_sparsity/linear_layer_sparsity": 0.3733516984593594, + "compression/movement_sparsity/model_sparsity": 0.36052591905311276, + "compression_loss": 83.73737335205078, + "distillation_loss": 1.3582063913345337, + "epoch": 1.22, + "learning_rate": 4.130539789880449e-05, + "loss": 84.8698, + "step": 1440, + "task_loss": 0.4855771064758301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.780094727446945, + "compression/movement_sparsity/importance_threshold": -0.002190392136004317, + "compression/movement_sparsity/linear_layer_sparsity": 0.3754805081832224, + "compression/movement_sparsity/model_sparsity": 0.3625815976139763, + "compression_loss": 83.98758697509766, + "distillation_loss": 1.9738010168075562, + "epoch": 1.22, + "learning_rate": 4.129935998067867e-05, + "loss": 85.7191, + "step": 1441, + "task_loss": 1.0342822074890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7824220056183431, + "compression/movement_sparsity/importance_threshold": -0.002183311132434435, + "compression/movement_sparsity/linear_layer_sparsity": 0.37757826737456157, + "compression/movement_sparsity/model_sparsity": 0.3646072923236309, + "compression_loss": 84.2373275756836, + "distillation_loss": 1.7988866567611694, + "epoch": 1.22, + "learning_rate": 4.1293322062552834e-05, + "loss": 85.7142, + "step": 1442, + "task_loss": 1.8961724042892456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7847442626953125, + "compression/movement_sparsity/importance_threshold": -0.0021762454061047265, + "compression/movement_sparsity/linear_layer_sparsity": 0.379596575836943, + "compression/movement_sparsity/model_sparsity": 0.36655626568128646, + "compression_loss": 84.4864273071289, + "distillation_loss": 0.8074774742126465, + "epoch": 1.22, + "learning_rate": 4.1287284144427e-05, + "loss": 85.8028, + "step": 1443, + "task_loss": 0.8500573039054871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7870615041002014, + "compression/movement_sparsity/importance_threshold": -0.002169194940517092, + "compression/movement_sparsity/linear_layer_sparsity": 0.3818171897274345, + "compression/movement_sparsity/model_sparsity": 0.36870059465323224, + "compression_loss": 84.73507690429688, + "distillation_loss": 1.3441245555877686, + "epoch": 1.22, + "learning_rate": 4.1281246226301175e-05, + "loss": 86.0975, + "step": 1444, + "task_loss": 0.8071223497390747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7893737352553581, + "compression/movement_sparsity/importance_threshold": -0.0021621597191734303, + "compression/movement_sparsity/linear_layer_sparsity": 0.3839934457143206, + "compression/movement_sparsity/model_sparsity": 0.37080208955202226, + "compression_loss": 84.98311614990234, + "distillation_loss": 1.5083156824111938, + "epoch": 1.22, + "learning_rate": 4.127520830817534e-05, + "loss": 86.5824, + "step": 1445, + "task_loss": 0.926063597202301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7916809615831306, + "compression/movement_sparsity/importance_threshold": -0.002155139725575644, + "compression/movement_sparsity/linear_layer_sparsity": 0.3859728933143768, + "compression/movement_sparsity/model_sparsity": 0.3727135370375234, + "compression_loss": 85.23066711425781, + "distillation_loss": 1.4518623352050781, + "epoch": 1.22, + "learning_rate": 4.126917039004951e-05, + "loss": 86.7656, + "step": 1446, + "task_loss": 1.374513864517212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7939831885058674, + "compression/movement_sparsity/importance_threshold": -0.0021481349432256325, + "compression/movement_sparsity/linear_layer_sparsity": 0.38789087183027005, + "compression/movement_sparsity/model_sparsity": 0.3745656270910037, + "compression_loss": 85.47763061523438, + "distillation_loss": 1.1333496570587158, + "epoch": 1.22, + "learning_rate": 4.1263132471923684e-05, + "loss": 86.8897, + "step": 1447, + "task_loss": 1.2312239408493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7962804214459166, + "compression/movement_sparsity/importance_threshold": -0.002141145355625296, + "compression/movement_sparsity/linear_layer_sparsity": 0.3901611975756355, + "compression/movement_sparsity/model_sparsity": 0.3767579601626769, + "compression_loss": 85.72413635253906, + "distillation_loss": 1.4136972427368164, + "epoch": 1.22, + "learning_rate": 4.125709455379785e-05, + "loss": 86.9395, + "step": 1448, + "task_loss": 0.8712185621261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7985726658256258, + "compression/movement_sparsity/importance_threshold": -0.002134170946276536, + "compression/movement_sparsity/linear_layer_sparsity": 0.39218958195966924, + "compression/movement_sparsity/model_sparsity": 0.37871666330307885, + "compression_loss": 85.97003936767578, + "distillation_loss": 1.6083614826202393, + "epoch": 1.22, + "learning_rate": 4.125105663567202e-05, + "loss": 87.662, + "step": 1449, + "task_loss": 0.5006231665611267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8008599270673442, + "compression/movement_sparsity/importance_threshold": -0.002127211698681252, + "compression/movement_sparsity/linear_layer_sparsity": 0.39427657362763324, + "compression/movement_sparsity/model_sparsity": 0.380731960386911, + "compression_loss": 86.21541595458984, + "distillation_loss": 1.863974690437317, + "epoch": 1.23, + "learning_rate": 4.124501871754619e-05, + "loss": 87.7385, + "step": 1450, + "task_loss": 1.1951030492782593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8031422105934196, + "compression/movement_sparsity/importance_threshold": -0.0021202675963413443, + "compression/movement_sparsity/linear_layer_sparsity": 0.3962438943492037, + "compression/movement_sparsity/model_sparsity": 0.3826316975895091, + "compression_loss": 86.46029663085938, + "distillation_loss": 1.480433464050293, + "epoch": 1.23, + "learning_rate": 4.123898079942036e-05, + "loss": 87.8981, + "step": 1451, + "task_loss": 1.4680736064910889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8054195218261998, + "compression/movement_sparsity/importance_threshold": -0.002113338622758714, + "compression/movement_sparsity/linear_layer_sparsity": 0.3982860749951922, + "compression/movement_sparsity/model_sparsity": 0.3846037230478253, + "compression_loss": 86.70458984375, + "distillation_loss": 1.581674575805664, + "epoch": 1.23, + "learning_rate": 4.123294288129453e-05, + "loss": 88.3844, + "step": 1452, + "task_loss": 2.433668851852417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8076918661880335, + "compression/movement_sparsity/importance_threshold": -0.002106424761435261, + "compression/movement_sparsity/linear_layer_sparsity": 0.4002846728084715, + "compression/movement_sparsity/model_sparsity": 0.3865336628778125, + "compression_loss": 86.94840240478516, + "distillation_loss": 1.4813472032546997, + "epoch": 1.23, + "learning_rate": 4.12269049631687e-05, + "loss": 88.6527, + "step": 1453, + "task_loss": 0.9392086267471313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8099592491012688, + "compression/movement_sparsity/importance_threshold": -0.0020995259958728864, + "compression/movement_sparsity/linear_layer_sparsity": 0.40240111717049615, + "compression/movement_sparsity/model_sparsity": 0.38857740086505715, + "compression_loss": 87.19169616699219, + "distillation_loss": 1.459622859954834, + "epoch": 1.23, + "learning_rate": 4.1220867045042874e-05, + "loss": 88.6092, + "step": 1454, + "task_loss": 1.4059369564056396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8122216759882535, + "compression/movement_sparsity/importance_threshold": -0.002092642309573491, + "compression/movement_sparsity/linear_layer_sparsity": 0.40416343760207085, + "compression/movement_sparsity/model_sparsity": 0.3902791801682755, + "compression_loss": 87.43447875976562, + "distillation_loss": 2.5178380012512207, + "epoch": 1.23, + "learning_rate": 4.121482912691704e-05, + "loss": 89.6026, + "step": 1455, + "task_loss": 1.5118138790130615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8144791522713364, + "compression/movement_sparsity/importance_threshold": -0.002085773686038974, + "compression/movement_sparsity/linear_layer_sparsity": 0.4061536765738375, + "compression/movement_sparsity/model_sparsity": 0.3922010483086707, + "compression_loss": 87.67665100097656, + "distillation_loss": 1.2558326721191406, + "epoch": 1.23, + "learning_rate": 4.120879120879121e-05, + "loss": 89.4982, + "step": 1456, + "task_loss": 0.8126417398452759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8167316833728653, + "compression/movement_sparsity/importance_threshold": -0.002078920108771236, + "compression/movement_sparsity/linear_layer_sparsity": 0.4083373136204902, + "compression/movement_sparsity/model_sparsity": 0.3943096707051176, + "compression_loss": 87.91842651367188, + "distillation_loss": 1.2887520790100098, + "epoch": 1.23, + "learning_rate": 4.120275329066538e-05, + "loss": 89.504, + "step": 1457, + "task_loss": 0.39491498470306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8189792747151883, + "compression/movement_sparsity/importance_threshold": -0.0020720815612721793, + "compression/movement_sparsity/linear_layer_sparsity": 0.4103506616291351, + "compression/movement_sparsity/model_sparsity": 0.39625385401588265, + "compression_loss": 88.15950012207031, + "distillation_loss": 2.394442081451416, + "epoch": 1.23, + "learning_rate": 4.119671537253955e-05, + "loss": 89.8026, + "step": 1458, + "task_loss": 1.57588529586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.821221931720654, + "compression/movement_sparsity/importance_threshold": -0.0020652580270437014, + "compression/movement_sparsity/linear_layer_sparsity": 0.4122462584823759, + "compression/movement_sparsity/model_sparsity": 0.39808433128567655, + "compression_loss": 88.40016174316406, + "distillation_loss": 2.413412570953369, + "epoch": 1.23, + "learning_rate": 4.119067745441372e-05, + "loss": 90.5116, + "step": 1459, + "task_loss": 1.7402560710906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8234596598116104, + "compression/movement_sparsity/importance_threshold": -0.0020584494895877054, + "compression/movement_sparsity/linear_layer_sparsity": 0.41412053492388373, + "compression/movement_sparsity/model_sparsity": 0.39989422056546986, + "compression_loss": 88.6402816772461, + "distillation_loss": 1.2922289371490479, + "epoch": 1.23, + "learning_rate": 4.118463953628789e-05, + "loss": 90.4996, + "step": 1460, + "task_loss": 1.291831135749817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8256924644104057, + "compression/movement_sparsity/importance_threshold": -0.00205165593240609, + "compression/movement_sparsity/linear_layer_sparsity": 0.41614310031411117, + "compression/movement_sparsity/model_sparsity": 0.4018473046124041, + "compression_loss": 88.87980651855469, + "distillation_loss": 1.0690573453903198, + "epoch": 1.23, + "learning_rate": 4.117860161816206e-05, + "loss": 90.2168, + "step": 1461, + "task_loss": 1.7019424438476562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8279203509393878, + "compression/movement_sparsity/importance_threshold": -0.0020448773390007572, + "compression/movement_sparsity/linear_layer_sparsity": 0.4183557607847896, + "compression/movement_sparsity/model_sparsity": 0.40398395338897486, + "compression_loss": 89.1187973022461, + "distillation_loss": 3.436924457550049, + "epoch": 1.24, + "learning_rate": 4.117256370003623e-05, + "loss": 91.0768, + "step": 1462, + "task_loss": 2.285360097885132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8301433248209054, + "compression/movement_sparsity/importance_threshold": -0.0020381136928736064, + "compression/movement_sparsity/linear_layer_sparsity": 0.4202366789876706, + "compression/movement_sparsity/model_sparsity": 0.4058002562652057, + "compression_loss": 89.35726928710938, + "distillation_loss": 2.199679136276245, + "epoch": 1.24, + "learning_rate": 4.11665257819104e-05, + "loss": 91.1663, + "step": 1463, + "task_loss": 1.2185769081115723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8323613914773064, + "compression/movement_sparsity/importance_threshold": -0.002031364977526538, + "compression/movement_sparsity/linear_layer_sparsity": 0.4223727505296238, + "compression/movement_sparsity/model_sparsity": 0.4078629471783682, + "compression_loss": 89.59520721435547, + "distillation_loss": 1.9171276092529297, + "epoch": 1.24, + "learning_rate": 4.1160487863784567e-05, + "loss": 91.434, + "step": 1464, + "task_loss": 1.48882257938385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8345745563309392, + "compression/movement_sparsity/importance_threshold": -0.0020246311764614525, + "compression/movement_sparsity/linear_layer_sparsity": 0.4245852798344582, + "compression/movement_sparsity/model_sparsity": 0.4099994692950452, + "compression_loss": 89.83258819580078, + "distillation_loss": 1.6365708112716675, + "epoch": 1.24, + "learning_rate": 4.115444994565874e-05, + "loss": 92.1061, + "step": 1465, + "task_loss": 1.5315965414047241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8367828248041516, + "compression/movement_sparsity/importance_threshold": -0.0020179122731802516, + "compression/movement_sparsity/linear_layer_sparsity": 0.42640592136993993, + "compression/movement_sparsity/model_sparsity": 0.4117575661928347, + "compression_loss": 90.0694351196289, + "distillation_loss": 2.515531301498413, + "epoch": 1.24, + "learning_rate": 4.114841202753291e-05, + "loss": 91.7685, + "step": 1466, + "task_loss": 2.0843374729156494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8389862023192922, + "compression/movement_sparsity/importance_threshold": -0.002011208251184834, + "compression/movement_sparsity/linear_layer_sparsity": 0.42824732288127576, + "compression/movement_sparsity/model_sparsity": 0.4135357098974424, + "compression_loss": 90.30577850341797, + "distillation_loss": 1.041003704071045, + "epoch": 1.24, + "learning_rate": 4.1142374109407075e-05, + "loss": 91.8211, + "step": 1467, + "task_loss": 0.6549727320671082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8411846942987089, + "compression/movement_sparsity/importance_threshold": -0.0020045190939771015, + "compression/movement_sparsity/linear_layer_sparsity": 0.4302388138906441, + "compression/movement_sparsity/model_sparsity": 0.4154587870640961, + "compression_loss": 90.5416259765625, + "distillation_loss": 0.758406400680542, + "epoch": 1.24, + "learning_rate": 4.113633619128125e-05, + "loss": 92.1033, + "step": 1468, + "task_loss": 0.7709652781486511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.84337830616475, + "compression/movement_sparsity/importance_threshold": -0.001997844785058954, + "compression/movement_sparsity/linear_layer_sparsity": 0.43213488771059033, + "compression/movement_sparsity/model_sparsity": 0.4172897249153218, + "compression_loss": 90.7768783569336, + "distillation_loss": 2.275209903717041, + "epoch": 1.24, + "learning_rate": 4.1130298273155416e-05, + "loss": 92.3913, + "step": 1469, + "task_loss": 1.299243688583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.845567043339764, + "compression/movement_sparsity/importance_threshold": -0.001991185307932291, + "compression/movement_sparsity/linear_layer_sparsity": 0.4342314067884954, + "compression/movement_sparsity/model_sparsity": 0.41931422211325375, + "compression_loss": 91.01165008544922, + "distillation_loss": 1.088546872138977, + "epoch": 1.24, + "learning_rate": 4.112426035502959e-05, + "loss": 92.3818, + "step": 1470, + "task_loss": 0.5441562533378601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8477509112460988, + "compression/movement_sparsity/importance_threshold": -0.0019845406460990145, + "compression/movement_sparsity/linear_layer_sparsity": 0.43640322698502093, + "compression/movement_sparsity/model_sparsity": 0.42141143360472827, + "compression_loss": 91.24581146240234, + "distillation_loss": 2.9870645999908447, + "epoch": 1.24, + "learning_rate": 4.111822243690376e-05, + "loss": 93.25, + "step": 1471, + "task_loss": 2.1369879245758057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8499299153061023, + "compression/movement_sparsity/importance_threshold": -0.001977910783061025, + "compression/movement_sparsity/linear_layer_sparsity": 0.4386132879871547, + "compression/movement_sparsity/model_sparsity": 0.42354557221249584, + "compression_loss": 91.47949981689453, + "distillation_loss": 2.1912131309509277, + "epoch": 1.24, + "learning_rate": 4.111218451877793e-05, + "loss": 93.7806, + "step": 1472, + "task_loss": 1.258070945739746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8521040609421231, + "compression/movement_sparsity/importance_threshold": -0.0019712957023202225, + "compression/movement_sparsity/linear_layer_sparsity": 0.4405848417882359, + "compression/movement_sparsity/model_sparsity": 0.425449397075301, + "compression_loss": 91.71265411376953, + "distillation_loss": 2.145315647125244, + "epoch": 1.24, + "learning_rate": 4.11061466006521e-05, + "loss": 93.3714, + "step": 1473, + "task_loss": 1.054006814956665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8542733535765097, + "compression/movement_sparsity/importance_threshold": -0.001964695387378506, + "compression/movement_sparsity/linear_layer_sparsity": 0.4426664198840933, + "compression/movement_sparsity/model_sparsity": 0.4274594665598824, + "compression_loss": 91.9453125, + "distillation_loss": 2.528749942779541, + "epoch": 1.25, + "learning_rate": 4.1100108682526265e-05, + "loss": 93.7783, + "step": 1474, + "task_loss": 1.4758806228637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8564377986316093, + "compression/movement_sparsity/importance_threshold": -0.001958109821737779, + "compression/movement_sparsity/linear_layer_sparsity": 0.44477889349829514, + "compression/movement_sparsity/model_sparsity": 0.4294993702067075, + "compression_loss": 92.17738342285156, + "distillation_loss": 1.3167691230773926, + "epoch": 1.25, + "learning_rate": 4.109407076440044e-05, + "loss": 93.6363, + "step": 1475, + "task_loss": 0.7906906008720398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.858597401529771, + "compression/movement_sparsity/importance_threshold": -0.001951538988899939, + "compression/movement_sparsity/linear_layer_sparsity": 0.4467375811224973, + "compression/movement_sparsity/model_sparsity": 0.4313907708853903, + "compression_loss": 92.40898132324219, + "distillation_loss": 2.044130325317383, + "epoch": 1.25, + "learning_rate": 4.108803284627461e-05, + "loss": 94.9576, + "step": 1476, + "task_loss": 2.0637025833129883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.860752167693343, + "compression/movement_sparsity/importance_threshold": -0.0019449828723668875, + "compression/movement_sparsity/linear_layer_sparsity": 0.4489042739269394, + "compression/movement_sparsity/model_sparsity": 0.43348303112647313, + "compression_loss": 92.64002227783203, + "distillation_loss": 1.6544620990753174, + "epoch": 1.25, + "learning_rate": 4.1081994928148774e-05, + "loss": 94.7044, + "step": 1477, + "task_loss": 1.0498977899551392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8629021025446726, + "compression/movement_sparsity/importance_threshold": -0.0019384414556405266, + "compression/movement_sparsity/linear_layer_sparsity": 0.4508674808106755, + "compression/movement_sparsity/model_sparsity": 0.4353787958142221, + "compression_loss": 92.87060546875, + "distillation_loss": 1.251671314239502, + "epoch": 1.25, + "learning_rate": 4.107595701002295e-05, + "loss": 94.4082, + "step": 1478, + "task_loss": 0.5553045868873596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.865047211506109, + "compression/movement_sparsity/importance_threshold": -0.0019319147222227537, + "compression/movement_sparsity/linear_layer_sparsity": 0.45276639258251905, + "compression/movement_sparsity/model_sparsity": 0.43721247412496694, + "compression_loss": 93.10057067871094, + "distillation_loss": 1.6820611953735352, + "epoch": 1.25, + "learning_rate": 4.1069919091897115e-05, + "loss": 94.7123, + "step": 1479, + "task_loss": 1.2488874197006226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8671875, + "compression/movement_sparsity/importance_threshold": -0.0019254026556154713, + "compression/movement_sparsity/linear_layer_sparsity": 0.45471199939482465, + "compression/movement_sparsity/model_sparsity": 0.4390912433578832, + "compression_loss": 93.330078125, + "distillation_loss": 1.3987200260162354, + "epoch": 1.25, + "learning_rate": 4.106388117377128e-05, + "loss": 95.0843, + "step": 1480, + "task_loss": 1.1722813844680786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8693229734486936, + "compression/movement_sparsity/importance_threshold": -0.0019189052393205801, + "compression/movement_sparsity/linear_layer_sparsity": 0.4566409242966077, + "compression/movement_sparsity/model_sparsity": 0.4409539037552229, + "compression_loss": 93.55905151367188, + "distillation_loss": 2.373427391052246, + "epoch": 1.25, + "learning_rate": 4.1057843255645456e-05, + "loss": 95.5699, + "step": 1481, + "task_loss": 2.5787312984466553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8714536372745385, + "compression/movement_sparsity/importance_threshold": -0.0019124224568399786, + "compression/movement_sparsity/linear_layer_sparsity": 0.4584991031118071, + "compression/movement_sparsity/model_sparsity": 0.4427482484116936, + "compression_loss": 93.78746795654297, + "distillation_loss": 2.107168674468994, + "epoch": 1.25, + "learning_rate": 4.105180533751963e-05, + "loss": 95.8652, + "step": 1482, + "task_loss": 1.9090006351470947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.873579496899882, + "compression/movement_sparsity/importance_threshold": -0.00190595429167557, + "compression/movement_sparsity/linear_layer_sparsity": 0.46043322695067934, + "compression/movement_sparsity/model_sparsity": 0.44461592914663967, + "compression_loss": 94.0154800415039, + "distillation_loss": 1.4259358644485474, + "epoch": 1.25, + "learning_rate": 4.104576741939379e-05, + "loss": 96.005, + "step": 1483, + "task_loss": 2.0791573524475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8757005577470733, + "compression/movement_sparsity/importance_threshold": -0.0018995007273292525, + "compression/movement_sparsity/linear_layer_sparsity": 0.46233469049439696, + "compression/movement_sparsity/model_sparsity": 0.4464520715680445, + "compression_loss": 94.24298095703125, + "distillation_loss": 2.386120319366455, + "epoch": 1.25, + "learning_rate": 4.1039729501267964e-05, + "loss": 96.691, + "step": 1484, + "task_loss": 1.3493601083755493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.87781682523846, + "compression/movement_sparsity/importance_threshold": -0.0018930617473029276, + "compression/movement_sparsity/linear_layer_sparsity": 0.4642194959759273, + "compression/movement_sparsity/model_sparsity": 0.44827212818294443, + "compression_loss": 94.46988677978516, + "distillation_loss": 2.0142054557800293, + "epoch": 1.26, + "learning_rate": 4.103369158314214e-05, + "loss": 97.2868, + "step": 1485, + "task_loss": 1.6579411029815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8799283047963904, + "compression/movement_sparsity/importance_threshold": -0.0018866373350984959, + "compression/movement_sparsity/linear_layer_sparsity": 0.46618483728567023, + "compression/movement_sparsity/model_sparsity": 0.4501699539726006, + "compression_loss": 94.69634246826172, + "distillation_loss": 1.5398972034454346, + "epoch": 1.26, + "learning_rate": 4.1027653665016306e-05, + "loss": 96.8952, + "step": 1486, + "task_loss": 1.0231614112854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8820350018432129, + "compression/movement_sparsity/importance_threshold": -0.0018802274742178572, + "compression/movement_sparsity/linear_layer_sparsity": 0.46836697188720083, + "compression/movement_sparsity/model_sparsity": 0.4522771255375374, + "compression_loss": 94.92230224609375, + "distillation_loss": 1.5074758529663086, + "epoch": 1.26, + "learning_rate": 4.102161574689047e-05, + "loss": 96.5317, + "step": 1487, + "task_loss": 0.5253912210464478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8841369218012756, + "compression/movement_sparsity/importance_threshold": -0.0018738321481629118, + "compression/movement_sparsity/linear_layer_sparsity": 0.47018136515884135, + "compression/movement_sparsity/model_sparsity": 0.4540291888185705, + "compression_loss": 95.14779663085938, + "distillation_loss": 4.042708873748779, + "epoch": 1.26, + "learning_rate": 4.101557782876465e-05, + "loss": 97.6667, + "step": 1488, + "task_loss": 2.2897398471832275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8862340700929262, + "compression/movement_sparsity/importance_threshold": -0.0018674513404355621, + "compression/movement_sparsity/linear_layer_sparsity": 0.47206333268847434, + "compression/movement_sparsity/model_sparsity": 0.4558465049739513, + "compression_loss": 95.37268829345703, + "distillation_loss": 2.2302136421203613, + "epoch": 1.26, + "learning_rate": 4.1009539910638814e-05, + "loss": 97.3186, + "step": 1489, + "task_loss": 1.1984933614730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8883264521405138, + "compression/movement_sparsity/importance_threshold": -0.0018610850345377058, + "compression/movement_sparsity/linear_layer_sparsity": 0.473868258171012, + "compression/movement_sparsity/model_sparsity": 0.45758942571356354, + "compression_loss": 95.59718322753906, + "distillation_loss": 2.9210283756256104, + "epoch": 1.26, + "learning_rate": 4.100350199251298e-05, + "loss": 97.5623, + "step": 1490, + "task_loss": 1.9384020566940308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8904140733663858, + "compression/movement_sparsity/importance_threshold": -0.0018547332139712451, + "compression/movement_sparsity/linear_layer_sparsity": 0.47591146429541714, + "compression/movement_sparsity/model_sparsity": 0.4595624414219581, + "compression_loss": 95.82103729248047, + "distillation_loss": 2.1385228633880615, + "epoch": 1.26, + "learning_rate": 4.0997464074387155e-05, + "loss": 98.023, + "step": 1491, + "task_loss": 1.2332741022109985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8924969391928907, + "compression/movement_sparsity/importance_threshold": -0.0018483958622380805, + "compression/movement_sparsity/linear_layer_sparsity": 0.4774953037858183, + "compression/movement_sparsity/model_sparsity": 0.4610918711534038, + "compression_loss": 96.04447937011719, + "distillation_loss": 1.8286014795303345, + "epoch": 1.26, + "learning_rate": 4.099142615626132e-05, + "loss": 98.1788, + "step": 1492, + "task_loss": 1.8470790386199951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8945750550423769, + "compression/movement_sparsity/importance_threshold": -0.0018420729628401116, + "compression/movement_sparsity/linear_layer_sparsity": 0.4793622230158948, + "compression/movement_sparsity/model_sparsity": 0.4628946559646119, + "compression_loss": 96.26741027832031, + "distillation_loss": 2.382716655731201, + "epoch": 1.26, + "learning_rate": 4.098538823813549e-05, + "loss": 98.6504, + "step": 1493, + "task_loss": 3.0367345809936523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8966484263371924, + "compression/movement_sparsity/importance_threshold": -0.0018357644992792388, + "compression/movement_sparsity/linear_layer_sparsity": 0.4812443336355394, + "compression/movement_sparsity/model_sparsity": 0.46471211029442216, + "compression_loss": 96.4897232055664, + "distillation_loss": 1.8656718730926514, + "epoch": 1.26, + "learning_rate": 4.097935032000966e-05, + "loss": 98.6476, + "step": 1494, + "task_loss": 1.6977344751358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8987170584996854, + "compression/movement_sparsity/importance_threshold": -0.0018294704550573635, + "compression/movement_sparsity/linear_layer_sparsity": 0.48303757343379394, + "compression/movement_sparsity/model_sparsity": 0.4664437467889558, + "compression_loss": 96.71153259277344, + "distillation_loss": 1.585442304611206, + "epoch": 1.26, + "learning_rate": 4.097331240188384e-05, + "loss": 98.652, + "step": 1495, + "task_loss": 1.535503625869751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9007809569522041, + "compression/movement_sparsity/importance_threshold": -0.0018231908136763855, + "compression/movement_sparsity/linear_layer_sparsity": 0.48498663825471394, + "compression/movement_sparsity/model_sparsity": 0.46832585523725245, + "compression_loss": 96.93289947509766, + "distillation_loss": 2.3262600898742676, + "epoch": 1.26, + "learning_rate": 4.0967274483758e-05, + "loss": 98.9485, + "step": 1496, + "task_loss": 1.2926844358444214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9028401271170969, + "compression/movement_sparsity/importance_threshold": -0.0018169255586382048, + "compression/movement_sparsity/linear_layer_sparsity": 0.48660384156616027, + "compression/movement_sparsity/model_sparsity": 0.46988750263985124, + "compression_loss": 97.1536636352539, + "distillation_loss": 2.212984323501587, + "epoch": 1.27, + "learning_rate": 4.096123656563217e-05, + "loss": 99.3644, + "step": 1497, + "task_loss": 1.1809406280517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9048945744167113, + "compression/movement_sparsity/importance_threshold": -0.001810674673444724, + "compression/movement_sparsity/linear_layer_sparsity": 0.48842699910101317, + "compression/movement_sparsity/model_sparsity": 0.4716480291046934, + "compression_loss": 97.37403869628906, + "distillation_loss": 3.554448127746582, + "epoch": 1.27, + "learning_rate": 4.0955198647506346e-05, + "loss": 100.0839, + "step": 1498, + "task_loss": 2.1796820163726807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9069443042733967, + "compression/movement_sparsity/importance_threshold": -0.0018044381415978396, + "compression/movement_sparsity/linear_layer_sparsity": 0.49017284033291514, + "compression/movement_sparsity/model_sparsity": 0.4733338953194437, + "compression_loss": 97.59376525878906, + "distillation_loss": 3.316885232925415, + "epoch": 1.27, + "learning_rate": 4.0949160729380506e-05, + "loss": 99.9674, + "step": 1499, + "task_loss": 1.997530221939087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9089893221095, + "compression/movement_sparsity/importance_threshold": -0.0017982159465994565, + "compression/movement_sparsity/linear_layer_sparsity": 0.49200291388499684, + "compression/movement_sparsity/model_sparsity": 0.47510110021504665, + "compression_loss": 97.81307983398438, + "distillation_loss": 2.1376092433929443, + "epoch": 1.27, + "learning_rate": 4.094312281125468e-05, + "loss": 99.9552, + "step": 1500, + "task_loss": 1.950959324836731 + }, + { + "epoch": 1.27, + "eval_accuracy": 0.7577029702970297, + "eval_loss": 99.48823547363281, + "eval_runtime": 309.4335, + "eval_samples_per_second": 81.601, + "eval_steps_per_second": 0.64, + "step": 1500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9110296333473702, + "compression/movement_sparsity/importance_threshold": -0.0017920080719514724, + "compression/movement_sparsity/linear_layer_sparsity": 0.4938485723241786, + "compression/movement_sparsity/model_sparsity": 0.47688335460893305, + "compression_loss": 98.0318374633789, + "distillation_loss": 1.574967861175537, + "epoch": 1.27, + "learning_rate": 4.0937084893128854e-05, + "loss": 100.5241, + "step": 1501, + "task_loss": 1.9993964433670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9130652434093551, + "compression/movement_sparsity/importance_threshold": -0.0017858145011557893, + "compression/movement_sparsity/linear_layer_sparsity": 0.4958555289789707, + "compression/movement_sparsity/model_sparsity": 0.47882136612851217, + "compression_loss": 98.25009155273438, + "distillation_loss": 3.003077983856201, + "epoch": 1.27, + "learning_rate": 4.093104697500302e-05, + "loss": 100.6841, + "step": 1502, + "task_loss": 1.8956490755081177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9150961577178031, + "compression/movement_sparsity/importance_threshold": -0.001779635217714307, + "compression/movement_sparsity/linear_layer_sparsity": 0.49774046562634505, + "compression/movement_sparsity/model_sparsity": 0.48064154940330583, + "compression_loss": 98.46780395507812, + "distillation_loss": 2.5956430435180664, + "epoch": 1.27, + "learning_rate": 4.092500905687719e-05, + "loss": 100.6119, + "step": 1503, + "task_loss": 1.0168830156326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9171223816950625, + "compression/movement_sparsity/importance_threshold": -0.0017734702051289248, + "compression/movement_sparsity/linear_layer_sparsity": 0.499530056629303, + "compression/movement_sparsity/model_sparsity": 0.48236966244988627, + "compression_loss": 98.6850814819336, + "distillation_loss": 1.9505882263183594, + "epoch": 1.27, + "learning_rate": 4.091897113875136e-05, + "loss": 101.0449, + "step": 1504, + "task_loss": 0.9648960828781128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9191439207634815, + "compression/movement_sparsity/importance_threshold": -0.0017673194469015443, + "compression/movement_sparsity/linear_layer_sparsity": 0.5012300732849804, + "compression/movement_sparsity/model_sparsity": 0.4840112783035782, + "compression_loss": 98.90174865722656, + "distillation_loss": 3.3877549171447754, + "epoch": 1.27, + "learning_rate": 4.091293322062553e-05, + "loss": 101.4548, + "step": 1505, + "task_loss": 1.6129616498947144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9211607803454076, + "compression/movement_sparsity/importance_threshold": -0.001761182926534067, + "compression/movement_sparsity/linear_layer_sparsity": 0.5031061383516336, + "compression/movement_sparsity/model_sparsity": 0.4858228947637407, + "compression_loss": 99.11808013916016, + "distillation_loss": 1.7419830560684204, + "epoch": 1.27, + "learning_rate": 4.09068953024997e-05, + "loss": 100.9467, + "step": 1506, + "task_loss": 1.4925693273544312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9231729658631901, + "compression/movement_sparsity/importance_threshold": -0.001755060627528391, + "compression/movement_sparsity/linear_layer_sparsity": 0.5047107616662241, + "compression/movement_sparsity/model_sparsity": 0.4873723943310762, + "compression_loss": 99.333740234375, + "distillation_loss": 3.8648133277893066, + "epoch": 1.27, + "learning_rate": 4.090085738437387e-05, + "loss": 102.3531, + "step": 1507, + "task_loss": 1.719414472579956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9251804827391765, + "compression/movement_sparsity/importance_threshold": -0.0017489525333864182, + "compression/movement_sparsity/linear_layer_sparsity": 0.5064230125178958, + "compression/movement_sparsity/model_sparsity": 0.4890258240984933, + "compression_loss": 99.54905700683594, + "distillation_loss": 4.190714359283447, + "epoch": 1.27, + "learning_rate": 4.089481946624804e-05, + "loss": 102.3868, + "step": 1508, + "task_loss": 3.118457317352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9271833363957147, + "compression/movement_sparsity/importance_threshold": -0.0017428586276100502, + "compression/movement_sparsity/linear_layer_sparsity": 0.5081770933496341, + "compression/movement_sparsity/model_sparsity": 0.49071964685747765, + "compression_loss": 99.76375579833984, + "distillation_loss": 3.2744250297546387, + "epoch": 1.28, + "learning_rate": 4.0888781548122205e-05, + "loss": 102.5605, + "step": 1509, + "task_loss": 1.922554850578308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9291815322551539, + "compression/movement_sparsity/importance_threshold": -0.0017367788937011847, + "compression/movement_sparsity/linear_layer_sparsity": 0.5101608455742068, + "compression/movement_sparsity/model_sparsity": 0.4926352510904006, + "compression_loss": 99.97798919677734, + "distillation_loss": 1.5416498184204102, + "epoch": 1.28, + "learning_rate": 4.088274362999638e-05, + "loss": 102.285, + "step": 1510, + "task_loss": 2.3689537048339844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9311750757398416, + "compression/movement_sparsity/importance_threshold": -0.0017307133151617236, + "compression/movement_sparsity/linear_layer_sparsity": 0.5121285001724711, + "compression/movement_sparsity/model_sparsity": 0.494535310700001, + "compression_loss": 100.1917724609375, + "distillation_loss": 2.8311927318573, + "epoch": 1.28, + "learning_rate": 4.087670571187055e-05, + "loss": 102.6318, + "step": 1511, + "task_loss": 2.023484706878662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9331639722721259, + "compression/movement_sparsity/importance_threshold": -0.0017246618754935684, + "compression/movement_sparsity/linear_layer_sparsity": 0.5138681408372024, + "compression/movement_sparsity/model_sparsity": 0.4962151893561381, + "compression_loss": 100.40502166748047, + "distillation_loss": 1.8619507551193237, + "epoch": 1.28, + "learning_rate": 4.087066779374472e-05, + "loss": 102.2339, + "step": 1512, + "task_loss": 1.015195608139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9351482272743554, + "compression/movement_sparsity/importance_threshold": -0.0017186245581986173, + "compression/movement_sparsity/linear_layer_sparsity": 0.5157130838263261, + "compression/movement_sparsity/model_sparsity": 0.49799675287787687, + "compression_loss": 100.61775207519531, + "distillation_loss": 2.4256768226623535, + "epoch": 1.28, + "learning_rate": 4.086462987561889e-05, + "loss": 102.7127, + "step": 1513, + "task_loss": 1.0213738679885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9371278461688781, + "compression/movement_sparsity/importance_threshold": -0.0017126013467787726, + "compression/movement_sparsity/linear_layer_sparsity": 0.5174307124776016, + "compression/movement_sparsity/model_sparsity": 0.49965537570093727, + "compression_loss": 100.83002471923828, + "distillation_loss": 3.780240535736084, + "epoch": 1.28, + "learning_rate": 4.085859195749306e-05, + "loss": 104.5235, + "step": 1514, + "task_loss": 1.6220946311950684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9391028343780421, + "compression/movement_sparsity/importance_threshold": -0.0017065922247359346, + "compression/movement_sparsity/linear_layer_sparsity": 0.5190585640707468, + "compression/movement_sparsity/model_sparsity": 0.5012273055840006, + "compression_loss": 101.04183197021484, + "distillation_loss": 2.694791316986084, + "epoch": 1.28, + "learning_rate": 4.085255403936723e-05, + "loss": 103.2456, + "step": 1515, + "task_loss": 1.0968552827835083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9410731973241961, + "compression/movement_sparsity/importance_threshold": -0.0017005971755720017, + "compression/movement_sparsity/linear_layer_sparsity": 0.5210256343847969, + "compression/movement_sparsity/model_sparsity": 0.503126800981347, + "compression_loss": 101.25298309326172, + "distillation_loss": 3.001863956451416, + "epoch": 1.28, + "learning_rate": 4.0846516121241396e-05, + "loss": 104.2717, + "step": 1516, + "task_loss": 2.290720224380493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9430389404296875, + "compression/movement_sparsity/importance_threshold": -0.0016946161827888773, + "compression/movement_sparsity/linear_layer_sparsity": 0.5226865755431316, + "compression/movement_sparsity/model_sparsity": 0.5047306837012402, + "compression_loss": 101.46376037597656, + "distillation_loss": 2.0780413150787354, + "epoch": 1.28, + "learning_rate": 4.084047820311557e-05, + "loss": 103.4117, + "step": 1517, + "task_loss": 0.8307811617851257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9450000691168647, + "compression/movement_sparsity/importance_threshold": -0.001688649229888461, + "compression/movement_sparsity/linear_layer_sparsity": 0.5244397262897943, + "compression/movement_sparsity/model_sparsity": 0.5064236083264326, + "compression_loss": 101.67395782470703, + "distillation_loss": 3.2058920860290527, + "epoch": 1.28, + "learning_rate": 4.083444028498974e-05, + "loss": 104.2834, + "step": 1518, + "task_loss": 1.202286720275879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9469565888080768, + "compression/movement_sparsity/importance_threshold": -0.0016826963003726508, + "compression/movement_sparsity/linear_layer_sparsity": 0.526173297553199, + "compression/movement_sparsity/model_sparsity": 0.5080976260838503, + "compression_loss": 101.88365936279297, + "distillation_loss": 2.6888389587402344, + "epoch": 1.28, + "learning_rate": 4.0828402366863904e-05, + "loss": 104.6186, + "step": 1519, + "task_loss": 0.8831520676612854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9489085049256708, + "compression/movement_sparsity/importance_threshold": -0.0016767573777433501, + "compression/movement_sparsity/linear_layer_sparsity": 0.5278062765384275, + "compression/movement_sparsity/model_sparsity": 0.5096745072173052, + "compression_loss": 102.0929183959961, + "distillation_loss": 2.503772258758545, + "epoch": 1.28, + "learning_rate": 4.082236444873808e-05, + "loss": 104.7031, + "step": 1520, + "task_loss": 2.653703451156616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9508558228919954, + "compression/movement_sparsity/importance_threshold": -0.0016708324455024588, + "compression/movement_sparsity/linear_layer_sparsity": 0.5297526584216294, + "compression/movement_sparsity/model_sparsity": 0.5115540248950482, + "compression_loss": 102.3016128540039, + "distillation_loss": 3.3353288173675537, + "epoch": 1.29, + "learning_rate": 4.0816326530612245e-05, + "loss": 105.0724, + "step": 1521, + "task_loss": 2.957670211791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9527985481293989, + "compression/movement_sparsity/importance_threshold": -0.0016649214871518763, + "compression/movement_sparsity/linear_layer_sparsity": 0.5316790911723766, + "compression/movement_sparsity/model_sparsity": 0.5134142787544068, + "compression_loss": 102.50987243652344, + "distillation_loss": 3.0619993209838867, + "epoch": 1.29, + "learning_rate": 4.081028861248642e-05, + "loss": 105.7065, + "step": 1522, + "task_loss": 2.0704619884490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9547366860602291, + "compression/movement_sparsity/importance_threshold": -0.001659024486193505, + "compression/movement_sparsity/linear_layer_sparsity": 0.5334817510630634, + "compression/movement_sparsity/model_sparsity": 0.5151550117322181, + "compression_loss": 102.7176742553711, + "distillation_loss": 3.695098876953125, + "epoch": 1.29, + "learning_rate": 4.0804250694360586e-05, + "loss": 105.8055, + "step": 1523, + "task_loss": 2.4045908451080322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9566702421068347, + "compression/movement_sparsity/importance_threshold": -0.001653141426129243, + "compression/movement_sparsity/linear_layer_sparsity": 0.5353800785506928, + "compression/movement_sparsity/model_sparsity": 0.516988125830709, + "compression_loss": 102.92491912841797, + "distillation_loss": 3.8236589431762695, + "epoch": 1.29, + "learning_rate": 4.079821277623475e-05, + "loss": 105.8456, + "step": 1524, + "task_loss": 1.9850085973739624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9585992216915636, + "compression/movement_sparsity/importance_threshold": -0.001647272290460992, + "compression/movement_sparsity/linear_layer_sparsity": 0.5371247035174959, + "compression/movement_sparsity/model_sparsity": 0.5186728175628083, + "compression_loss": 103.13175964355469, + "distillation_loss": 1.726851224899292, + "epoch": 1.29, + "learning_rate": 4.079217485810893e-05, + "loss": 105.0065, + "step": 1525, + "task_loss": 0.6944738030433655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9605236302367641, + "compression/movement_sparsity/importance_threshold": -0.0016414170626906522, + "compression/movement_sparsity/linear_layer_sparsity": 0.5389055898780797, + "compression/movement_sparsity/model_sparsity": 0.5203925249982587, + "compression_loss": 103.33800506591797, + "distillation_loss": 2.450979471206665, + "epoch": 1.29, + "learning_rate": 4.0786136939983095e-05, + "loss": 105.5405, + "step": 1526, + "task_loss": 1.497849702835083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9624434731647843, + "compression/movement_sparsity/importance_threshold": -0.0016355757263201245, + "compression/movement_sparsity/linear_layer_sparsity": 0.5406081225331282, + "compression/movement_sparsity/model_sparsity": 0.5220365704190032, + "compression_loss": 103.54375457763672, + "distillation_loss": 4.790939807891846, + "epoch": 1.29, + "learning_rate": 4.078009902185727e-05, + "loss": 106.9204, + "step": 1527, + "task_loss": 2.5016121864318848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9643587558979725, + "compression/movement_sparsity/importance_threshold": -0.0016297482648513083, + "compression/movement_sparsity/linear_layer_sparsity": 0.5422847201235688, + "compression/movement_sparsity/model_sparsity": 0.5236555717243947, + "compression_loss": 103.74905395507812, + "distillation_loss": 1.872643232345581, + "epoch": 1.29, + "learning_rate": 4.0774061103731436e-05, + "loss": 106.8003, + "step": 1528, + "task_loss": 0.9709722399711609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9662694838586766, + "compression/movement_sparsity/importance_threshold": -0.001623934661786106, + "compression/movement_sparsity/linear_layer_sparsity": 0.5441093324068732, + "compression/movement_sparsity/model_sparsity": 0.5254175029626037, + "compression_loss": 103.9538345336914, + "distillation_loss": 2.067540168762207, + "epoch": 1.29, + "learning_rate": 4.07680231856056e-05, + "loss": 106.312, + "step": 1529, + "task_loss": 2.535637617111206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9681756624692455, + "compression/movement_sparsity/importance_threshold": -0.0016181349006264157, + "compression/movement_sparsity/linear_layer_sparsity": 0.5459100725065706, + "compression/movement_sparsity/model_sparsity": 0.527156382100152, + "compression_loss": 104.1581039428711, + "distillation_loss": 3.6433424949645996, + "epoch": 1.29, + "learning_rate": 4.076198526747978e-05, + "loss": 107.3289, + "step": 1530, + "task_loss": 1.8894073963165283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9700772971520265, + "compression/movement_sparsity/importance_threshold": -0.0016123489648741393, + "compression/movement_sparsity/linear_layer_sparsity": 0.5476512394647595, + "compression/movement_sparsity/model_sparsity": 0.5288377346168709, + "compression_loss": 104.36184692382812, + "distillation_loss": 3.907491683959961, + "epoch": 1.29, + "learning_rate": 4.0755947349353944e-05, + "loss": 107.7219, + "step": 1531, + "task_loss": 1.3845911026000977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9719743933293683, + "compression/movement_sparsity/importance_threshold": -0.0016065768380311776, + "compression/movement_sparsity/linear_layer_sparsity": 0.5495527745534828, + "compression/movement_sparsity/model_sparsity": 0.5306739461254906, + "compression_loss": 104.56517791748047, + "distillation_loss": 3.3820080757141113, + "epoch": 1.29, + "learning_rate": 4.074990943122812e-05, + "loss": 107.3892, + "step": 1532, + "task_loss": 1.8725686073303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9738669564236192, + "compression/movement_sparsity/importance_threshold": -0.0016008185035994295, + "compression/movement_sparsity/linear_layer_sparsity": 0.5511934446268365, + "compression/movement_sparsity/model_sparsity": 0.532258254134533, + "compression_loss": 104.76799011230469, + "distillation_loss": 2.408450126647949, + "epoch": 1.3, + "learning_rate": 4.0743871513102285e-05, + "loss": 107.5906, + "step": 1533, + "task_loss": 1.7001370191574097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9757549918571271, + "compression/movement_sparsity/importance_threshold": -0.0015950739450807968, + "compression/movement_sparsity/linear_layer_sparsity": 0.5529400370812996, + "compression/movement_sparsity/model_sparsity": 0.5339448457650383, + "compression_loss": 104.97030639648438, + "distillation_loss": 2.979132652282715, + "epoch": 1.3, + "learning_rate": 4.073783359497645e-05, + "loss": 107.9387, + "step": 1534, + "task_loss": 2.4278464317321777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9776385050522403, + "compression/movement_sparsity/importance_threshold": -0.00158934314597718, + "compression/movement_sparsity/linear_layer_sparsity": 0.5546922100462162, + "compression/movement_sparsity/model_sparsity": 0.5356368261982956, + "compression_loss": 105.17210388183594, + "distillation_loss": 2.3510773181915283, + "epoch": 1.3, + "learning_rate": 4.0731795676850626e-05, + "loss": 107.4359, + "step": 1535, + "task_loss": 1.115848183631897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9795175014313072, + "compression/movement_sparsity/importance_threshold": -0.0015836260897904779, + "compression/movement_sparsity/linear_layer_sparsity": 0.5564566410554624, + "compression/movement_sparsity/model_sparsity": 0.5373406435743495, + "compression_loss": 105.37339782714844, + "distillation_loss": 3.2075424194335938, + "epoch": 1.3, + "learning_rate": 4.0725757758724793e-05, + "loss": 108.1529, + "step": 1536, + "task_loss": 2.3909096717834473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9813919864166756, + "compression/movement_sparsity/importance_threshold": -0.0015779227600225933, + "compression/movement_sparsity/linear_layer_sparsity": 0.5581915955223129, + "compression/movement_sparsity/model_sparsity": 0.5390159970179195, + "compression_loss": 105.57422637939453, + "distillation_loss": 4.313943386077881, + "epoch": 1.3, + "learning_rate": 4.071971984059896e-05, + "loss": 108.7084, + "step": 1537, + "task_loss": 2.286760091781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9832619654306939, + "compression/movement_sparsity/importance_threshold": -0.0015722331401754252, + "compression/movement_sparsity/linear_layer_sparsity": 0.5599624059612444, + "compression/movement_sparsity/model_sparsity": 0.5407259746706236, + "compression_loss": 105.77454376220703, + "distillation_loss": 4.445111274719238, + "epoch": 1.3, + "learning_rate": 4.0713681922473135e-05, + "loss": 109.1375, + "step": 1538, + "task_loss": 2.6167187690734863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9851274438957104, + "compression/movement_sparsity/importance_threshold": -0.0015665572137508739, + "compression/movement_sparsity/linear_layer_sparsity": 0.5615362291508313, + "compression/movement_sparsity/model_sparsity": 0.5422457321920018, + "compression_loss": 105.97439575195312, + "distillation_loss": 2.8024816513061523, + "epoch": 1.3, + "learning_rate": 4.07076440043473e-05, + "loss": 109.2061, + "step": 1539, + "task_loss": 1.3571248054504395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.986988427234073, + "compression/movement_sparsity/importance_threshold": -0.0015608949642508412, + "compression/movement_sparsity/linear_layer_sparsity": 0.5631657858999484, + "compression/movement_sparsity/model_sparsity": 0.5438193086536838, + "compression_loss": 106.1738052368164, + "distillation_loss": 4.957732200622559, + "epoch": 1.3, + "learning_rate": 4.070160608622147e-05, + "loss": 109.9056, + "step": 1540, + "task_loss": 2.3707399368286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9888449208681303, + "compression/movement_sparsity/importance_threshold": -0.0015552463751772262, + "compression/movement_sparsity/linear_layer_sparsity": 0.564945599085445, + "compression/movement_sparsity/model_sparsity": 0.5455379797809127, + "compression_loss": 106.37268829345703, + "distillation_loss": 2.8952476978302, + "epoch": 1.3, + "learning_rate": 4.069556816809564e-05, + "loss": 109.2616, + "step": 1541, + "task_loss": 1.3892970085144043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9906969302202301, + "compression/movement_sparsity/importance_threshold": -0.00154961143003193, + "compression/movement_sparsity/linear_layer_sparsity": 0.5665855894812434, + "compression/movement_sparsity/model_sparsity": 0.5471216314614149, + "compression_loss": 106.57115173339844, + "distillation_loss": 3.9397149085998535, + "epoch": 1.3, + "learning_rate": 4.068953024996982e-05, + "loss": 110.1103, + "step": 1542, + "task_loss": 2.8137640953063965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9925444607127207, + "compression/movement_sparsity/importance_threshold": -0.0015439901123168532, + "compression/movement_sparsity/linear_layer_sparsity": 0.5682454455403232, + "compression/movement_sparsity/model_sparsity": 0.5487244663585508, + "compression_loss": 106.76905822753906, + "distillation_loss": 5.813114166259766, + "epoch": 1.3, + "learning_rate": 4.0683492331843984e-05, + "loss": 110.4854, + "step": 1543, + "task_loss": 2.9105277061462402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943875177679506, + "compression/movement_sparsity/importance_threshold": -0.0015383824055338956, + "compression/movement_sparsity/linear_layer_sparsity": 0.5698902771481817, + "compression/movement_sparsity/model_sparsity": 0.5503127929405854, + "compression_loss": 106.96646881103516, + "distillation_loss": 3.403697967529297, + "epoch": 1.3, + "learning_rate": 4.067745441371815e-05, + "loss": 109.8051, + "step": 1544, + "task_loss": 2.2851877212524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962261068082677, + "compression/movement_sparsity/importance_threshold": -0.0015327882931849579, + "compression/movement_sparsity/linear_layer_sparsity": 0.5714345045536965, + "compression/movement_sparsity/model_sparsity": 0.5518039713841217, + "compression_loss": 107.16344451904297, + "distillation_loss": 3.2117934226989746, + "epoch": 1.31, + "learning_rate": 4.0671416495592325e-05, + "loss": 110.1169, + "step": 1545, + "task_loss": 1.3985012769699097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99806023325602, + "compression/movement_sparsity/importance_threshold": -0.0015272077587719416, + "compression/movement_sparsity/linear_layer_sparsity": 0.5729264921807986, + "compression/movement_sparsity/model_sparsity": 0.553244704646342, + "compression_loss": 107.35993957519531, + "distillation_loss": 3.835989475250244, + "epoch": 1.31, + "learning_rate": 4.066537857746649e-05, + "loss": 110.8416, + "step": 1546, + "task_loss": 2.1488330364227295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998899025335564, + "compression/movement_sparsity/importance_threshold": -0.0015216407857967453, + "compression/movement_sparsity/linear_layer_sparsity": 0.574570775276946, + "compression/movement_sparsity/model_sparsity": 0.5548325015597302, + "compression_loss": 107.55591583251953, + "distillation_loss": 4.261700630187988, + "epoch": 1.31, + "learning_rate": 4.065934065934066e-05, + "loss": 111.0396, + "step": 1547, + "task_loss": 2.754579544067383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0017151200632244, + "compression/movement_sparsity/importance_threshold": -0.0015160873577612704, + "compression/movement_sparsity/linear_layer_sparsity": 0.5763561212762255, + "compression/movement_sparsity/model_sparsity": 0.5565565154315678, + "compression_loss": 107.75151062011719, + "distillation_loss": 4.171040058135986, + "epoch": 1.31, + "learning_rate": 4.0653302741214834e-05, + "loss": 111.2257, + "step": 1548, + "task_loss": 3.101100444793701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0035358912673722, + "compression/movement_sparsity/importance_threshold": -0.0015105474581674186, + "compression/movement_sparsity/linear_layer_sparsity": 0.5780150949469003, + "compression/movement_sparsity/model_sparsity": 0.558158498253055, + "compression_loss": 107.94657135009766, + "distillation_loss": 3.8038101196289062, + "epoch": 1.31, + "learning_rate": 4.0647264823089e-05, + "loss": 111.2947, + "step": 1549, + "task_loss": 1.986894130706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0053522215683484, + "compression/movement_sparsity/importance_threshold": -0.0015050210705170887, + "compression/movement_sparsity/linear_layer_sparsity": 0.5796629075966678, + "compression/movement_sparsity/model_sparsity": 0.5597497034690383, + "compression_loss": 108.14114379882812, + "distillation_loss": 3.1140084266662598, + "epoch": 1.31, + "learning_rate": 4.064122690496317e-05, + "loss": 111.0902, + "step": 1550, + "task_loss": 1.6885912418365479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0071641163885015, + "compression/movement_sparsity/importance_threshold": -0.0014995081783121802, + "compression/movement_sparsity/linear_layer_sparsity": 0.5813403875755133, + "compression/movement_sparsity/model_sparsity": 0.5613695568500785, + "compression_loss": 108.33525085449219, + "distillation_loss": 4.062110900878906, + "epoch": 1.31, + "learning_rate": 4.063518898683734e-05, + "loss": 112.0002, + "step": 1551, + "task_loss": 1.5930944681167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0089715811501785, + "compression/movement_sparsity/importance_threshold": -0.0014940087650545972, + "compression/movement_sparsity/linear_layer_sparsity": 0.5830102599354072, + "compression/movement_sparsity/model_sparsity": 0.5629820639572818, + "compression_loss": 108.52886962890625, + "distillation_loss": 3.835289239883423, + "epoch": 1.31, + "learning_rate": 4.0629151068711516e-05, + "loss": 111.6382, + "step": 1552, + "task_loss": 1.416469693183899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.010774621275729, + "compression/movement_sparsity/importance_threshold": -0.001488522814246236, + "compression/movement_sparsity/linear_layer_sparsity": 0.5846757442016112, + "compression/movement_sparsity/model_sparsity": 0.5645903337153128, + "compression_loss": 108.72208404541016, + "distillation_loss": 3.3042421340942383, + "epoch": 1.31, + "learning_rate": 4.0623113150585676e-05, + "loss": 111.8082, + "step": 1553, + "task_loss": 2.028383255004883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0125732421875, + "compression/movement_sparsity/importance_threshold": -0.0014830503093890002, + "compression/movement_sparsity/linear_layer_sparsity": 0.5862880228318238, + "compression/movement_sparsity/model_sparsity": 0.5661472256146284, + "compression_loss": 108.9146728515625, + "distillation_loss": 2.4580612182617188, + "epoch": 1.31, + "learning_rate": 4.061707523245985e-05, + "loss": 111.8634, + "step": 1554, + "task_loss": 2.290478229522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0143674493078403, + "compression/movement_sparsity/importance_threshold": -0.0014775912339847893, + "compression/movement_sparsity/linear_layer_sparsity": 0.5877998403497043, + "compression/movement_sparsity/model_sparsity": 0.567607107549875, + "compression_loss": 109.10681915283203, + "distillation_loss": 2.4995217323303223, + "epoch": 1.31, + "learning_rate": 4.0611037314334024e-05, + "loss": 111.6819, + "step": 1555, + "task_loss": 0.9833770990371704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0161572480590984, + "compression/movement_sparsity/importance_threshold": -0.0014721455715355023, + "compression/movement_sparsity/linear_layer_sparsity": 0.5895517032862625, + "compression/movement_sparsity/model_sparsity": 0.5692987886052016, + "compression_loss": 109.2984848022461, + "distillation_loss": 4.18853235244751, + "epoch": 1.32, + "learning_rate": 4.0604999396208185e-05, + "loss": 112.338, + "step": 1556, + "task_loss": 2.8779873847961426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0179426438636214, + "compression/movement_sparsity/importance_threshold": -0.001466713305543042, + "compression/movement_sparsity/linear_layer_sparsity": 0.5911535482697937, + "compression/movement_sparsity/model_sparsity": 0.5708456052856969, + "compression_loss": 109.48965454101562, + "distillation_loss": 4.407125473022461, + "epoch": 1.32, + "learning_rate": 4.059896147808236e-05, + "loss": 113.0236, + "step": 1557, + "task_loss": 2.585376024246216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0197236421437588, + "compression/movement_sparsity/importance_threshold": -0.0014612944195093068, + "compression/movement_sparsity/linear_layer_sparsity": 0.5929016550935465, + "compression/movement_sparsity/model_sparsity": 0.5725336592622483, + "compression_loss": 109.68032836914062, + "distillation_loss": 3.2936229705810547, + "epoch": 1.32, + "learning_rate": 4.059292355995653e-05, + "loss": 112.6751, + "step": 1558, + "task_loss": 1.312962532043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0215002483218583, + "compression/movement_sparsity/importance_threshold": -0.0014558888969361983, + "compression/movement_sparsity/linear_layer_sparsity": 0.5946845804867961, + "compression/movement_sparsity/model_sparsity": 0.5742553356833195, + "compression_loss": 109.87052917480469, + "distillation_loss": 2.9357213973999023, + "epoch": 1.32, + "learning_rate": 4.05868856418307e-05, + "loss": 113.0302, + "step": 1559, + "task_loss": 2.1829209327697754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0232724678202672, + "compression/movement_sparsity/importance_threshold": -0.001450496721325618, + "compression/movement_sparsity/linear_layer_sparsity": 0.5964785834317793, + "compression/movement_sparsity/model_sparsity": 0.575987709108144, + "compression_loss": 110.0603256225586, + "distillation_loss": 4.300307273864746, + "epoch": 1.32, + "learning_rate": 4.058084772370487e-05, + "loss": 113.8539, + "step": 1560, + "task_loss": 3.6302967071533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0250403060613351, + "compression/movement_sparsity/importance_threshold": -0.0014451178761794637, + "compression/movement_sparsity/linear_layer_sparsity": 0.5979925472998342, + "compression/movement_sparsity/model_sparsity": 0.5774496636598337, + "compression_loss": 110.24958801269531, + "distillation_loss": 4.431704521179199, + "epoch": 1.32, + "learning_rate": 4.057480980557904e-05, + "loss": 113.5981, + "step": 1561, + "task_loss": 2.741734504699707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0268037684674094, + "compression/movement_sparsity/importance_threshold": -0.0014397523449996377, + "compression/movement_sparsity/linear_layer_sparsity": 0.5994690096606744, + "compression/movement_sparsity/model_sparsity": 0.5788754049964495, + "compression_loss": 110.43843841552734, + "distillation_loss": 2.4832940101623535, + "epoch": 1.32, + "learning_rate": 4.056877188745321e-05, + "loss": 113.3837, + "step": 1562, + "task_loss": 1.6893428564071655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0285628604608383, + "compression/movement_sparsity/importance_threshold": -0.0014344001112880406, + "compression/movement_sparsity/linear_layer_sparsity": 0.6010364057239057, + "compression/movement_sparsity/model_sparsity": 0.5803889561830345, + "compression_loss": 110.62679290771484, + "distillation_loss": 1.991780161857605, + "epoch": 1.32, + "learning_rate": 4.0562733969327375e-05, + "loss": 114.1848, + "step": 1563, + "task_loss": 1.2107614278793335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0303175874639705, + "compression/movement_sparsity/importance_threshold": -0.0014290611585465714, + "compression/movement_sparsity/linear_layer_sparsity": 0.602795661644398, + "compression/movement_sparsity/model_sparsity": 0.5820877762505537, + "compression_loss": 110.81462860107422, + "distillation_loss": 2.226391077041626, + "epoch": 1.32, + "learning_rate": 4.055669605120155e-05, + "loss": 114.0549, + "step": 1564, + "task_loss": 0.8832390904426575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0320679548991538, + "compression/movement_sparsity/importance_threshold": -0.001423735470277132, + "compression/movement_sparsity/linear_layer_sparsity": 0.6043786902913999, + "compression/movement_sparsity/model_sparsity": 0.5836164229935654, + "compression_loss": 111.00200653076172, + "distillation_loss": 2.9719977378845215, + "epoch": 1.32, + "learning_rate": 4.0550658133075716e-05, + "loss": 113.6935, + "step": 1565, + "task_loss": 1.253769874572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0338139681887362, + "compression/movement_sparsity/importance_threshold": -0.001418423029981623, + "compression/movement_sparsity/linear_layer_sparsity": 0.6061804081728436, + "compression/movement_sparsity/model_sparsity": 0.585356246323049, + "compression_loss": 111.18893432617188, + "distillation_loss": 3.539640426635742, + "epoch": 1.32, + "learning_rate": 4.0544620214949883e-05, + "loss": 114.8504, + "step": 1566, + "task_loss": 1.5691958665847778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0355556327550666, + "compression/movement_sparsity/importance_threshold": -0.0014131238211619428, + "compression/movement_sparsity/linear_layer_sparsity": 0.6077575343568657, + "compression/movement_sparsity/model_sparsity": 0.5868791933708423, + "compression_loss": 111.37535095214844, + "distillation_loss": 5.6533203125, + "epoch": 1.32, + "learning_rate": 4.053858229682406e-05, + "loss": 114.7147, + "step": 1567, + "task_loss": 2.867809772491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0372929540204925, + "compression/movement_sparsity/importance_threshold": -0.001407837827319994, + "compression/movement_sparsity/linear_layer_sparsity": 0.6093499830963, + "compression/movement_sparsity/model_sparsity": 0.5884169365971316, + "compression_loss": 111.56130981445312, + "distillation_loss": 3.4040427207946777, + "epoch": 1.33, + "learning_rate": 4.053254437869823e-05, + "loss": 114.4516, + "step": 1568, + "task_loss": 2.158994674682617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.039025937407362, + "compression/movement_sparsity/importance_threshold": -0.001402565031957677, + "compression/movement_sparsity/linear_layer_sparsity": 0.6110209047829458, + "compression/movement_sparsity/model_sparsity": 0.5900304569834848, + "compression_loss": 111.74673461914062, + "distillation_loss": 2.1351006031036377, + "epoch": 1.33, + "learning_rate": 4.052650646057239e-05, + "loss": 115.3035, + "step": 1569, + "task_loss": 1.0629807710647583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0407545883380243, + "compression/movement_sparsity/importance_threshold": -0.0013973054185768908, + "compression/movement_sparsity/linear_layer_sparsity": 0.6126326587497825, + "compression/movement_sparsity/model_sparsity": 0.5915868422432254, + "compression_loss": 111.93167877197266, + "distillation_loss": 4.234724998474121, + "epoch": 1.33, + "learning_rate": 4.0520468542446566e-05, + "loss": 116.3498, + "step": 1570, + "task_loss": 2.031952142715454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0424789122348264, + "compression/movement_sparsity/importance_threshold": -0.001392058970679537, + "compression/movement_sparsity/linear_layer_sparsity": 0.6140838895719052, + "compression/movement_sparsity/model_sparsity": 0.5929882188221001, + "compression_loss": 112.11622619628906, + "distillation_loss": 4.4984283447265625, + "epoch": 1.33, + "learning_rate": 4.051443062432074e-05, + "loss": 115.949, + "step": 1571, + "task_loss": 2.239861011505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0441989145201171, + "compression/movement_sparsity/importance_threshold": -0.0013868256717675157, + "compression/movement_sparsity/linear_layer_sparsity": 0.615532759408836, + "compression/movement_sparsity/model_sparsity": 0.5943873155228874, + "compression_loss": 112.30032348632812, + "distillation_loss": 4.103102684020996, + "epoch": 1.33, + "learning_rate": 4.050839270619491e-05, + "loss": 116.7342, + "step": 1572, + "task_loss": 2.0543394088745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.045914600616245, + "compression/movement_sparsity/importance_threshold": -0.0013816055053427262, + "compression/movement_sparsity/linear_layer_sparsity": 0.61706463337668, + "compression/movement_sparsity/model_sparsity": 0.5958665649073404, + "compression_loss": 112.48394775390625, + "distillation_loss": 3.718435525894165, + "epoch": 1.33, + "learning_rate": 4.0502354788069074e-05, + "loss": 117.0921, + "step": 1573, + "task_loss": 2.3761038780212402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0476259759455573, + "compression/movement_sparsity/importance_threshold": -0.0013763984549070717, + "compression/movement_sparsity/linear_layer_sparsity": 0.6187296645245138, + "compression/movement_sparsity/model_sparsity": 0.5974743971130111, + "compression_loss": 112.66709899902344, + "distillation_loss": 4.060519695281982, + "epoch": 1.33, + "learning_rate": 4.049631686994325e-05, + "loss": 115.3242, + "step": 1574, + "task_loss": 1.791649580001831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0493330459304029, + "compression/movement_sparsity/importance_threshold": -0.00137120450396245, + "compression/movement_sparsity/linear_layer_sparsity": 0.6203383778286033, + "compression/movement_sparsity/model_sparsity": 0.5990278461661241, + "compression_loss": 112.84989166259766, + "distillation_loss": 4.050065994262695, + "epoch": 1.33, + "learning_rate": 4.0490278951817415e-05, + "loss": 117.2844, + "step": 1575, + "task_loss": 1.2940340042114258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0510358159931297, + "compression/movement_sparsity/importance_threshold": -0.0013660236360107632, + "compression/movement_sparsity/linear_layer_sparsity": 0.6218575764062505, + "compression/movement_sparsity/model_sparsity": 0.6004948555990277, + "compression_loss": 113.03218078613281, + "distillation_loss": 3.8653383255004883, + "epoch": 1.33, + "learning_rate": 4.048424103369158e-05, + "loss": 116.1084, + "step": 1576, + "task_loss": 1.5775467157363892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.052734291556086, + "compression/movement_sparsity/importance_threshold": -0.0013608558345539114, + "compression/movement_sparsity/linear_layer_sparsity": 0.6232577360183887, + "compression/movement_sparsity/model_sparsity": 0.6018469154210943, + "compression_loss": 113.21402740478516, + "distillation_loss": 3.3859777450561523, + "epoch": 1.33, + "learning_rate": 4.0478203115565756e-05, + "loss": 117.2089, + "step": 1577, + "task_loss": 1.5121915340423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0544284780416202, + "compression/movement_sparsity/importance_threshold": -0.001355701083093794, + "compression/movement_sparsity/linear_layer_sparsity": 0.624745526338483, + "compression/movement_sparsity/model_sparsity": 0.6032835955667148, + "compression_loss": 113.39543151855469, + "distillation_loss": 3.930778741836548, + "epoch": 1.33, + "learning_rate": 4.0472165197439924e-05, + "loss": 117.133, + "step": 1578, + "task_loss": 2.0993504524230957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0561183808720802, + "compression/movement_sparsity/importance_threshold": -0.0013505593651323127, + "compression/movement_sparsity/linear_layer_sparsity": 0.626201693766007, + "compression/movement_sparsity/model_sparsity": 0.6046897391634084, + "compression_loss": 113.57635498046875, + "distillation_loss": 3.901603937149048, + "epoch": 1.33, + "learning_rate": 4.046612727931409e-05, + "loss": 116.7314, + "step": 1579, + "task_loss": 2.076174259185791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0578040054698143, + "compression/movement_sparsity/importance_threshold": -0.0013454306641713684, + "compression/movement_sparsity/linear_layer_sparsity": 0.627513709930981, + "compression/movement_sparsity/model_sparsity": 0.6059566835368817, + "compression_loss": 113.75682067871094, + "distillation_loss": 5.026679039001465, + "epoch": 1.34, + "learning_rate": 4.0460089361188265e-05, + "loss": 118.5587, + "step": 1580, + "task_loss": 1.9855003356933594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0594853572571705, + "compression/movement_sparsity/importance_threshold": -0.00134031496371286, + "compression/movement_sparsity/linear_layer_sparsity": 0.629108293096422, + "compression/movement_sparsity/model_sparsity": 0.6074964878650783, + "compression_loss": 113.93682861328125, + "distillation_loss": 2.958275318145752, + "epoch": 1.34, + "learning_rate": 4.045405144306243e-05, + "loss": 117.3651, + "step": 1581, + "task_loss": 1.7174173593521118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0611624416564975, + "compression/movement_sparsity/importance_threshold": -0.001335212247258689, + "compression/movement_sparsity/linear_layer_sparsity": 0.6307970175653482, + "compression/movement_sparsity/model_sparsity": 0.6091271994533728, + "compression_loss": 114.1164321899414, + "distillation_loss": 4.417238235473633, + "epoch": 1.34, + "learning_rate": 4.0448013524936606e-05, + "loss": 118.0716, + "step": 1582, + "task_loss": 2.5637683868408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0628352640901428, + "compression/movement_sparsity/importance_threshold": -0.001330122498310756, + "compression/movement_sparsity/linear_layer_sparsity": 0.6321590079168842, + "compression/movement_sparsity/model_sparsity": 0.610442401246361, + "compression_loss": 114.29553985595703, + "distillation_loss": 2.7364110946655273, + "epoch": 1.34, + "learning_rate": 4.044197560681077e-05, + "loss": 117.8519, + "step": 1583, + "task_loss": 1.3970770835876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0645038299804552, + "compression/movement_sparsity/importance_threshold": -0.0013250457003709604, + "compression/movement_sparsity/linear_layer_sparsity": 0.6334853092346859, + "compression/movement_sparsity/model_sparsity": 0.6117231400337162, + "compression_loss": 114.47422790527344, + "distillation_loss": 3.7062671184539795, + "epoch": 1.34, + "learning_rate": 4.043593768868495e-05, + "loss": 118.2151, + "step": 1584, + "task_loss": 3.64023756980896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0661681447497828, + "compression/movement_sparsity/importance_threshold": -0.001319981836941203, + "compression/movement_sparsity/linear_layer_sparsity": 0.6349112012005824, + "compression/movement_sparsity/model_sparsity": 0.6131000482240274, + "compression_loss": 114.65242767333984, + "distillation_loss": 4.417819976806641, + "epoch": 1.34, + "learning_rate": 4.0429899770559114e-05, + "loss": 118.1918, + "step": 1585, + "task_loss": 2.5193395614624023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0678282138204733, + "compression/movement_sparsity/importance_threshold": -0.0013149308915233857, + "compression/movement_sparsity/linear_layer_sparsity": 0.6364070403338309, + "compression/movement_sparsity/model_sparsity": 0.6145445006813093, + "compression_loss": 114.83014678955078, + "distillation_loss": 5.502124786376953, + "epoch": 1.34, + "learning_rate": 4.042386185243328e-05, + "loss": 118.9, + "step": 1586, + "task_loss": 2.3925704956054688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0694840426148755, + "compression/movement_sparsity/importance_threshold": -0.0013098928476194065, + "compression/movement_sparsity/linear_layer_sparsity": 0.6378704099586069, + "compression/movement_sparsity/model_sparsity": 0.6159575990576227, + "compression_loss": 115.00747680664062, + "distillation_loss": 3.6127185821533203, + "epoch": 1.34, + "learning_rate": 4.0417823934307455e-05, + "loss": 118.0272, + "step": 1587, + "task_loss": 1.549818992614746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0711356365553373, + "compression/movement_sparsity/importance_threshold": -0.001304867688731168, + "compression/movement_sparsity/linear_layer_sparsity": 0.6392611614024806, + "compression/movement_sparsity/model_sparsity": 0.6173005739109474, + "compression_loss": 115.18421936035156, + "distillation_loss": 3.0144705772399902, + "epoch": 1.34, + "learning_rate": 4.041178601618162e-05, + "loss": 118.5309, + "step": 1588, + "task_loss": 1.477476954460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0727830010642068, + "compression/movement_sparsity/importance_threshold": -0.0012998553983605702, + "compression/movement_sparsity/linear_layer_sparsity": 0.6407908890201501, + "compression/movement_sparsity/model_sparsity": 0.6187777506789575, + "compression_loss": 115.36055755615234, + "distillation_loss": 4.683080673217773, + "epoch": 1.34, + "learning_rate": 4.040574809805579e-05, + "loss": 119.6105, + "step": 1589, + "task_loss": 1.9613977670669556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.074426141563833, + "compression/movement_sparsity/importance_threshold": -0.0012948559600095113, + "compression/movement_sparsity/linear_layer_sparsity": 0.6422824116047144, + "compression/movement_sparsity/model_sparsity": 0.6202180348742818, + "compression_loss": 115.5364761352539, + "distillation_loss": 4.053908348083496, + "epoch": 1.34, + "learning_rate": 4.0399710179929964e-05, + "loss": 119.3435, + "step": 1590, + "task_loss": 2.8004140853881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0760650634765625, + "compression/movement_sparsity/importance_threshold": -0.0012898693571798958, + "compression/movement_sparsity/linear_layer_sparsity": 0.6436950796687028, + "compression/movement_sparsity/model_sparsity": 0.6215821734443968, + "compression_loss": 115.71187591552734, + "distillation_loss": 3.4829654693603516, + "epoch": 1.34, + "learning_rate": 4.039367226180413e-05, + "loss": 119.6533, + "step": 1591, + "task_loss": 2.3413448333740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0776997722247448, + "compression/movement_sparsity/importance_threshold": -0.0012848955733736215, + "compression/movement_sparsity/linear_layer_sparsity": 0.6452024971687257, + "compression/movement_sparsity/model_sparsity": 0.6230378065159353, + "compression_loss": 115.8868179321289, + "distillation_loss": 4.101276874542236, + "epoch": 1.35, + "learning_rate": 4.03876343436783e-05, + "loss": 119.8203, + "step": 1592, + "task_loss": 2.8775336742401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.079330273230728, + "compression/movement_sparsity/importance_threshold": -0.0012799345920925884, + "compression/movement_sparsity/linear_layer_sparsity": 0.6465461958471082, + "compression/movement_sparsity/model_sparsity": 0.6243353450110147, + "compression_loss": 116.0613021850586, + "distillation_loss": 4.3211212158203125, + "epoch": 1.35, + "learning_rate": 4.038159642555247e-05, + "loss": 120.0946, + "step": 1593, + "task_loss": 2.5605499744415283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0809565719168597, + "compression/movement_sparsity/importance_threshold": -0.0012749863968386991, + "compression/movement_sparsity/linear_layer_sparsity": 0.6479110837713795, + "compression/movement_sparsity/model_sparsity": 0.625653344836201, + "compression_loss": 116.23534393310547, + "distillation_loss": 3.038931131362915, + "epoch": 1.35, + "learning_rate": 4.037555850742664e-05, + "loss": 120.7138, + "step": 1594, + "task_loss": 1.8257532119750977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0825786737054883, + "compression/movement_sparsity/importance_threshold": -0.0012700509711138525, + "compression/movement_sparsity/linear_layer_sparsity": 0.6494545480301657, + "compression/movement_sparsity/model_sparsity": 0.6271437863494465, + "compression_loss": 116.40900421142578, + "distillation_loss": 3.7389769554138184, + "epoch": 1.35, + "learning_rate": 4.036952058930081e-05, + "loss": 120.3522, + "step": 1595, + "task_loss": 2.2965049743652344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0841965840189625, + "compression/movement_sparsity/importance_threshold": -0.0012651282984199493, + "compression/movement_sparsity/linear_layer_sparsity": 0.6510160058579143, + "compression/movement_sparsity/model_sparsity": 0.6286516032972057, + "compression_loss": 116.58209228515625, + "distillation_loss": 4.013985633850098, + "epoch": 1.35, + "learning_rate": 4.036348267117498e-05, + "loss": 120.2463, + "step": 1596, + "task_loss": 1.764885425567627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0858103082796298, + "compression/movement_sparsity/importance_threshold": -0.0012602183622588909, + "compression/movement_sparsity/linear_layer_sparsity": 0.652394213077435, + "compression/movement_sparsity/model_sparsity": 0.6299824648588745, + "compression_loss": 116.75476837158203, + "distillation_loss": 3.6824915409088135, + "epoch": 1.35, + "learning_rate": 4.035744475304915e-05, + "loss": 120.4639, + "step": 1597, + "task_loss": 2.8522896766662598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0874198519098388, + "compression/movement_sparsity/importance_threshold": -0.0012553211461325759, + "compression/movement_sparsity/linear_layer_sparsity": 0.6536660328733085, + "compression/movement_sparsity/model_sparsity": 0.6312105937321845, + "compression_loss": 116.92699432373047, + "distillation_loss": 4.527782917022705, + "epoch": 1.35, + "learning_rate": 4.035140683492332e-05, + "loss": 120.5602, + "step": 1598, + "task_loss": 2.5846996307373047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0890252203319377, + "compression/movement_sparsity/importance_threshold": -0.0012504366335429062, + "compression/movement_sparsity/linear_layer_sparsity": 0.6551264691528461, + "compression/movement_sparsity/model_sparsity": 0.6326208595326924, + "compression_loss": 117.09871673583984, + "distillation_loss": 3.6654696464538574, + "epoch": 1.35, + "learning_rate": 4.034536891679749e-05, + "loss": 121.0475, + "step": 1599, + "task_loss": 3.141796112060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0906264189682742, + "compression/movement_sparsity/importance_threshold": -0.001245564807991783, + "compression/movement_sparsity/linear_layer_sparsity": 0.6563391450772457, + "compression/movement_sparsity/model_sparsity": 0.6337918763084615, + "compression_loss": 117.27008056640625, + "distillation_loss": 4.536157608032227, + "epoch": 1.35, + "learning_rate": 4.033933099867166e-05, + "loss": 120.5534, + "step": 1600, + "task_loss": 2.339637279510498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0922234532411974, + "compression/movement_sparsity/importance_threshold": -0.001240705652981104, + "compression/movement_sparsity/linear_layer_sparsity": 0.6577863574548751, + "compression/movement_sparsity/model_sparsity": 0.6351893724887733, + "compression_loss": 117.44084930419922, + "distillation_loss": 5.000787734985352, + "epoch": 1.35, + "learning_rate": 4.033329308054583e-05, + "loss": 122.1345, + "step": 1601, + "task_loss": 2.2831034660339355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0938163285730549, + "compression/movement_sparsity/importance_threshold": -0.0012358591520127723, + "compression/movement_sparsity/linear_layer_sparsity": 0.6591165460513262, + "compression/movement_sparsity/model_sparsity": 0.6364738650147975, + "compression_loss": 117.61127471923828, + "distillation_loss": 3.8120532035827637, + "epoch": 1.35, + "learning_rate": 4.032725516242e-05, + "loss": 121.8695, + "step": 1602, + "task_loss": 2.370018482208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0954050503861945, + "compression/movement_sparsity/importance_threshold": -0.0012310252885886878, + "compression/movement_sparsity/linear_layer_sparsity": 0.6605452998174554, + "compression/movement_sparsity/model_sparsity": 0.6378535366936994, + "compression_loss": 117.7812271118164, + "distillation_loss": 4.224034309387207, + "epoch": 1.35, + "learning_rate": 4.032121724429417e-05, + "loss": 122.1887, + "step": 1603, + "task_loss": 1.872103214263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.096989624102965, + "compression/movement_sparsity/importance_threshold": -0.00122620404621075, + "compression/movement_sparsity/linear_layer_sparsity": 0.6619422160559967, + "compression/movement_sparsity/model_sparsity": 0.6392024645620298, + "compression_loss": 117.95074462890625, + "distillation_loss": 3.277021884918213, + "epoch": 1.36, + "learning_rate": 4.031517932616834e-05, + "loss": 122.2191, + "step": 1604, + "task_loss": 1.7506492137908936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0985700551457152, + "compression/movement_sparsity/importance_threshold": -0.0012213954083808592, + "compression/movement_sparsity/linear_layer_sparsity": 0.6632956925518405, + "compression/movement_sparsity/model_sparsity": 0.6405094449764608, + "compression_loss": 118.11978912353516, + "distillation_loss": 4.338192462921143, + "epoch": 1.36, + "learning_rate": 4.030914140804251e-05, + "loss": 121.9303, + "step": 1605, + "task_loss": 2.8371849060058594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1001463489367915, + "compression/movement_sparsity/importance_threshold": -0.0012165993586009181, + "compression/movement_sparsity/linear_layer_sparsity": 0.6647438230903779, + "compression/movement_sparsity/model_sparsity": 0.6419078277760288, + "compression_loss": 118.28837585449219, + "distillation_loss": 3.3354732990264893, + "epoch": 1.36, + "learning_rate": 4.030310348991668e-05, + "loss": 121.9622, + "step": 1606, + "task_loss": 1.886362075805664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.101718510898544, + "compression/movement_sparsity/importance_threshold": -0.0012118158803728243, + "compression/movement_sparsity/linear_layer_sparsity": 0.6660084049164965, + "compression/movement_sparsity/model_sparsity": 0.6431289673261115, + "compression_loss": 118.45646667480469, + "distillation_loss": 4.3371758460998535, + "epoch": 1.36, + "learning_rate": 4.0297065571790846e-05, + "loss": 122.3046, + "step": 1607, + "task_loss": 2.660590887069702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1032865464533197, + "compression/movement_sparsity/importance_threshold": -0.0012070449571984802, + "compression/movement_sparsity/linear_layer_sparsity": 0.6675098126360309, + "compression/movement_sparsity/model_sparsity": 0.6445787970716095, + "compression_loss": 118.62409973144531, + "distillation_loss": 4.052557945251465, + "epoch": 1.36, + "learning_rate": 4.029102765366502e-05, + "loss": 122.1051, + "step": 1608, + "task_loss": 2.041536331176758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1048504610234668, + "compression/movement_sparsity/importance_threshold": -0.001202286572579786, + "compression/movement_sparsity/linear_layer_sparsity": 0.6688460944821439, + "compression/movement_sparsity/model_sparsity": 0.6458691735254246, + "compression_loss": 118.79136657714844, + "distillation_loss": 4.164777755737305, + "epoch": 1.36, + "learning_rate": 4.028498973553919e-05, + "loss": 122.6175, + "step": 1609, + "task_loss": 2.1523187160491943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1064102600313346, + "compression/movement_sparsity/importance_threshold": -0.001197540710018641, + "compression/movement_sparsity/linear_layer_sparsity": 0.6703787435208842, + "compression/movement_sparsity/model_sparsity": 0.6473491713547044, + "compression_loss": 118.9581298828125, + "distillation_loss": 3.983055591583252, + "epoch": 1.36, + "learning_rate": 4.0278951817413355e-05, + "loss": 122.874, + "step": 1610, + "task_loss": 2.0215349197387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1079659488992701, + "compression/movement_sparsity/importance_threshold": -0.001192807353016947, + "compression/movement_sparsity/linear_layer_sparsity": 0.6719127042180646, + "compression/movement_sparsity/model_sparsity": 0.6488304357829214, + "compression_loss": 119.1243896484375, + "distillation_loss": 6.733787536621094, + "epoch": 1.36, + "learning_rate": 4.027291389928753e-05, + "loss": 123.9715, + "step": 1611, + "task_loss": 4.168168067932129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1095175330496219, + "compression/movement_sparsity/importance_threshold": -0.001188086485076604, + "compression/movement_sparsity/linear_layer_sparsity": 0.673421087575666, + "compression/movement_sparsity/model_sparsity": 0.6502870015318593, + "compression_loss": 119.29032135009766, + "distillation_loss": 3.532302141189575, + "epoch": 1.36, + "learning_rate": 4.0266875981161696e-05, + "loss": 123.0577, + "step": 1612, + "task_loss": 3.5787298679351807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1110650179047383, + "compression/movement_sparsity/importance_threshold": -0.0011833780896995123, + "compression/movement_sparsity/linear_layer_sparsity": 0.6749048356029317, + "compression/movement_sparsity/model_sparsity": 0.6517197782498456, + "compression_loss": 119.45575714111328, + "distillation_loss": 2.62689208984375, + "epoch": 1.36, + "learning_rate": 4.026083806303586e-05, + "loss": 122.7526, + "step": 1613, + "task_loss": 1.0241925716400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1126084088869674, + "compression/movement_sparsity/importance_threshold": -0.0011786821503875725, + "compression/movement_sparsity/linear_layer_sparsity": 0.6764263832416031, + "compression/movement_sparsity/model_sparsity": 0.6531890560463006, + "compression_loss": 119.62071990966797, + "distillation_loss": 3.7785258293151855, + "epoch": 1.36, + "learning_rate": 4.025480014491004e-05, + "loss": 123.8019, + "step": 1614, + "task_loss": 2.802356243133545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1141477114186573, + "compression/movement_sparsity/importance_threshold": -0.001173998650642685, + "compression/movement_sparsity/linear_layer_sparsity": 0.6779000076505459, + "compression/movement_sparsity/model_sparsity": 0.6546120569233973, + "compression_loss": 119.78531646728516, + "distillation_loss": 4.216497421264648, + "epoch": 1.36, + "learning_rate": 4.024876222678421e-05, + "loss": 123.9349, + "step": 1615, + "task_loss": 2.1455159187316895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1156829309221568, + "compression/movement_sparsity/importance_threshold": -0.00116932757396675, + "compression/movement_sparsity/linear_layer_sparsity": 0.6792506342703248, + "compression/movement_sparsity/model_sparsity": 0.6559162853637733, + "compression_loss": 119.94945526123047, + "distillation_loss": 2.9395642280578613, + "epoch": 1.37, + "learning_rate": 4.024272430865838e-05, + "loss": 124.6031, + "step": 1616, + "task_loss": 1.4227871894836426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.117214072819813, + "compression/movement_sparsity/importance_threshold": -0.0011646689038616689, + "compression/movement_sparsity/linear_layer_sparsity": 0.6805866537847499, + "compression/movement_sparsity/model_sparsity": 0.6572064084978011, + "compression_loss": 120.11316680908203, + "distillation_loss": 6.896747589111328, + "epoch": 1.37, + "learning_rate": 4.0236686390532545e-05, + "loss": 124.8741, + "step": 1617, + "task_loss": 3.7542102336883545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.118741142533975, + "compression/movement_sparsity/importance_threshold": -0.0011600226238293412, + "compression/movement_sparsity/linear_layer_sparsity": 0.6820786175635167, + "compression/movement_sparsity/model_sparsity": 0.6586471187309498, + "compression_loss": 120.27645111083984, + "distillation_loss": 5.756622314453125, + "epoch": 1.37, + "learning_rate": 4.023064847240672e-05, + "loss": 124.8108, + "step": 1618, + "task_loss": 2.2962188720703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1202641454869908, + "compression/movement_sparsity/importance_threshold": -0.0011553887173716675, + "compression/movement_sparsity/linear_layer_sparsity": 0.6834281113873703, + "compression/movement_sparsity/model_sparsity": 0.6599502532904254, + "compression_loss": 120.4393081665039, + "distillation_loss": 6.1587629318237305, + "epoch": 1.37, + "learning_rate": 4.0224610554280886e-05, + "loss": 124.2767, + "step": 1619, + "task_loss": 2.4162888526916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1217830871012082, + "compression/movement_sparsity/importance_threshold": -0.0011507671679905486, + "compression/movement_sparsity/linear_layer_sparsity": 0.6848800338112158, + "compression/movement_sparsity/model_sparsity": 0.6613522977123761, + "compression_loss": 120.60175323486328, + "distillation_loss": 5.484973907470703, + "epoch": 1.37, + "learning_rate": 4.0218572636155054e-05, + "loss": 125.7443, + "step": 1620, + "task_loss": 2.0053982734680176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1232979727989758, + "compression/movement_sparsity/importance_threshold": -0.001146157959187885, + "compression/movement_sparsity/linear_layer_sparsity": 0.6860141533192304, + "compression/movement_sparsity/model_sparsity": 0.6624474567263307, + "compression_loss": 120.76371765136719, + "distillation_loss": 4.485365867614746, + "epoch": 1.37, + "learning_rate": 4.021253471802923e-05, + "loss": 125.1201, + "step": 1621, + "task_loss": 2.640087127685547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1248088080026417, + "compression/movement_sparsity/importance_threshold": -0.0011415610744655766, + "compression/movement_sparsity/linear_layer_sparsity": 0.6875402559899386, + "compression/movement_sparsity/model_sparsity": 0.6639211330754592, + "compression_loss": 120.92523956298828, + "distillation_loss": 4.5597920417785645, + "epoch": 1.37, + "learning_rate": 4.0206496799903395e-05, + "loss": 125.0075, + "step": 1622, + "task_loss": 2.3231732845306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.126315598134554, + "compression/movement_sparsity/importance_threshold": -0.001136976497325525, + "compression/movement_sparsity/linear_layer_sparsity": 0.6889218377489003, + "compression/movement_sparsity/model_sparsity": 0.6652552532507578, + "compression_loss": 121.08631896972656, + "distillation_loss": 4.363131523132324, + "epoch": 1.37, + "learning_rate": 4.020045888177756e-05, + "loss": 125.8885, + "step": 1623, + "task_loss": 2.6924426555633545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.127818348617061, + "compression/movement_sparsity/importance_threshold": -0.001132404211269629, + "compression/movement_sparsity/linear_layer_sparsity": 0.6901701430861958, + "compression/movement_sparsity/model_sparsity": 0.666460675459481, + "compression_loss": 121.24699401855469, + "distillation_loss": 5.69732666015625, + "epoch": 1.37, + "learning_rate": 4.0194420963651736e-05, + "loss": 125.9492, + "step": 1624, + "task_loss": 3.3758645057678223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.129317064872511, + "compression/movement_sparsity/importance_threshold": -0.0011278441997997898, + "compression/movement_sparsity/linear_layer_sparsity": 0.6914210359678685, + "compression/movement_sparsity/model_sparsity": 0.6676685963224714, + "compression_loss": 121.40718078613281, + "distillation_loss": 3.309131145477295, + "epoch": 1.37, + "learning_rate": 4.018838304552591e-05, + "loss": 125.4112, + "step": 1625, + "task_loss": 1.5360796451568604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.130811752323252, + "compression/movement_sparsity/importance_threshold": -0.001123296446417909, + "compression/movement_sparsity/linear_layer_sparsity": 0.692645731453245, + "compression/movement_sparsity/model_sparsity": 0.6688512197503214, + "compression_loss": 121.56698608398438, + "distillation_loss": 4.673027992248535, + "epoch": 1.37, + "learning_rate": 4.018234512740007e-05, + "loss": 126.2882, + "step": 1626, + "task_loss": 1.935702919960022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1323024163916324, + "compression/movement_sparsity/importance_threshold": -0.0011187609346258848, + "compression/movement_sparsity/linear_layer_sparsity": 0.6940715041774652, + "compression/movement_sparsity/model_sparsity": 0.6702280127952746, + "compression_loss": 121.72631072998047, + "distillation_loss": 4.488462924957275, + "epoch": 1.38, + "learning_rate": 4.0176307209274244e-05, + "loss": 126.1917, + "step": 1627, + "task_loss": 2.3700146675109863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1337890625, + "compression/movement_sparsity/importance_threshold": -0.00111423764792562, + "compression/movement_sparsity/linear_layer_sparsity": 0.6952129928210787, + "compression/movement_sparsity/model_sparsity": 0.6713302877923502, + "compression_loss": 121.88512420654297, + "distillation_loss": 5.591846942901611, + "epoch": 1.38, + "learning_rate": 4.017026929114842e-05, + "loss": 126.0691, + "step": 1628, + "task_loss": 2.6662471294403076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1352716960707032, + "compression/movement_sparsity/importance_threshold": -0.0011097265698190141, + "compression/movement_sparsity/linear_layer_sparsity": 0.6965690687854672, + "compression/movement_sparsity/model_sparsity": 0.6726397783755843, + "compression_loss": 122.04363250732422, + "distillation_loss": 4.409328460693359, + "epoch": 1.38, + "learning_rate": 4.016423137302258e-05, + "loss": 126.66, + "step": 1629, + "task_loss": 2.1426570415496826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1367503225260904, + "compression/movement_sparsity/importance_threshold": -0.0011052276838079663, + "compression/movement_sparsity/linear_layer_sparsity": 0.6978997939694619, + "compression/movement_sparsity/model_sparsity": 0.6739247890557193, + "compression_loss": 122.20164489746094, + "distillation_loss": 4.842893123626709, + "epoch": 1.38, + "learning_rate": 4.015819345489675e-05, + "loss": 126.8245, + "step": 1630, + "task_loss": 2.280773639678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1382249472885098, + "compression/movement_sparsity/importance_threshold": -0.0011007409733943792, + "compression/movement_sparsity/linear_layer_sparsity": 0.6993608503057166, + "compression/movement_sparsity/model_sparsity": 0.6753356536120886, + "compression_loss": 122.3591537475586, + "distillation_loss": 5.322578430175781, + "epoch": 1.38, + "learning_rate": 4.0152155536770927e-05, + "loss": 127.1086, + "step": 1631, + "task_loss": 2.8617260456085205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1396955757803091, + "compression/movement_sparsity/importance_threshold": -0.001096266422080152, + "compression/movement_sparsity/linear_layer_sparsity": 0.7008561170789185, + "compression/movement_sparsity/model_sparsity": 0.6767795533716524, + "compression_loss": 122.51630401611328, + "distillation_loss": 3.590932846069336, + "epoch": 1.38, + "learning_rate": 4.0146117618645094e-05, + "loss": 126.9574, + "step": 1632, + "task_loss": 2.191063642501831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1411622134238373, + "compression/movement_sparsity/importance_threshold": -0.001091804013367185, + "compression/movement_sparsity/linear_layer_sparsity": 0.702183968538513, + "compression/movement_sparsity/model_sparsity": 0.6780617890486609, + "compression_loss": 122.67301177978516, + "distillation_loss": 5.580835342407227, + "epoch": 1.38, + "learning_rate": 4.014007970051926e-05, + "loss": 127.1921, + "step": 1633, + "task_loss": 2.5486977100372314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1426248656414415, + "compression/movement_sparsity/importance_threshold": -0.00108735373075738, + "compression/movement_sparsity/linear_layer_sparsity": 0.703367227541355, + "compression/movement_sparsity/model_sparsity": 0.6792043994646247, + "compression_loss": 122.82927703857422, + "distillation_loss": 6.5097198486328125, + "epoch": 1.38, + "learning_rate": 4.0134041782393435e-05, + "loss": 127.2932, + "step": 1634, + "task_loss": 3.167710065841675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1440835378554708, + "compression/movement_sparsity/importance_threshold": -0.0010829155577526356, + "compression/movement_sparsity/linear_layer_sparsity": 0.7048051867648989, + "compression/movement_sparsity/model_sparsity": 0.68059296036516, + "compression_loss": 122.98515319824219, + "distillation_loss": 4.602743148803711, + "epoch": 1.38, + "learning_rate": 4.01280038642676e-05, + "loss": 127.9748, + "step": 1635, + "task_loss": 2.8524296283721924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1455382354882733, + "compression/movement_sparsity/importance_threshold": -0.0010784894778548535, + "compression/movement_sparsity/linear_layer_sparsity": 0.7059523274639719, + "compression/movement_sparsity/model_sparsity": 0.6817006932522022, + "compression_loss": 123.14057922363281, + "distillation_loss": 4.733675003051758, + "epoch": 1.38, + "learning_rate": 4.012196594614177e-05, + "loss": 127.5029, + "step": 1636, + "task_loss": 2.788015842437744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1469889639621966, + "compression/movement_sparsity/importance_threshold": -0.0010740754745659346, + "compression/movement_sparsity/linear_layer_sparsity": 0.7072163727025468, + "compression/movement_sparsity/model_sparsity": 0.6829213146481741, + "compression_loss": 123.29558563232422, + "distillation_loss": 3.9853811264038086, + "epoch": 1.38, + "learning_rate": 4.011592802801594e-05, + "loss": 127.6096, + "step": 1637, + "task_loss": 1.706752896308899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1484357286995897, + "compression/movement_sparsity/importance_threshold": -0.0010696735313877777, + "compression/movement_sparsity/linear_layer_sparsity": 0.7083849411308614, + "compression/movement_sparsity/model_sparsity": 0.6840497391560391, + "compression_loss": 123.45013427734375, + "distillation_loss": 3.209531307220459, + "epoch": 1.38, + "learning_rate": 4.010989010989011e-05, + "loss": 126.747, + "step": 1638, + "task_loss": 2.3288533687591553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1498785351228002, + "compression/movement_sparsity/importance_threshold": -0.0010652836318222844, + "compression/movement_sparsity/linear_layer_sparsity": 0.7097099665627261, + "compression/movement_sparsity/model_sparsity": 0.6853292458880643, + "compression_loss": 123.60420227050781, + "distillation_loss": 5.037357807159424, + "epoch": 1.39, + "learning_rate": 4.010385219176428e-05, + "loss": 128.1229, + "step": 1639, + "task_loss": 2.4419302940368652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1513173886541763, + "compression/movement_sparsity/importance_threshold": -0.0010609057593713557, + "compression/movement_sparsity/linear_layer_sparsity": 0.7110772989413628, + "compression/movement_sparsity/model_sparsity": 0.6866496061930885, + "compression_loss": 123.75798034667969, + "distillation_loss": 4.979372024536133, + "epoch": 1.39, + "learning_rate": 4.009781427363845e-05, + "loss": 128.3517, + "step": 1640, + "task_loss": 2.1684041023254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1527522947160667, + "compression/movement_sparsity/importance_threshold": -0.0010565398975368907, + "compression/movement_sparsity/linear_layer_sparsity": 0.7123028887393121, + "compression/movement_sparsity/model_sparsity": 0.687833093211123, + "compression_loss": 123.91126251220703, + "distillation_loss": 4.463488578796387, + "epoch": 1.39, + "learning_rate": 4.0091776355512625e-05, + "loss": 128.0204, + "step": 1641, + "task_loss": 2.0553979873657227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1541832587308194, + "compression/movement_sparsity/importance_threshold": -0.00105218602982079, + "compression/movement_sparsity/linear_layer_sparsity": 0.7136189114246118, + "compression/movement_sparsity/model_sparsity": 0.6891039064686233, + "compression_loss": 124.06411743164062, + "distillation_loss": 5.677126884460449, + "epoch": 1.39, + "learning_rate": 4.0085738437386786e-05, + "loss": 128.5432, + "step": 1642, + "task_loss": 2.7731428146362305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.155610286120782, + "compression/movement_sparsity/importance_threshold": -0.0010478441397249556, + "compression/movement_sparsity/linear_layer_sparsity": 0.7148010853281989, + "compression/movement_sparsity/model_sparsity": 0.6902454690618298, + "compression_loss": 124.21653747558594, + "distillation_loss": 5.455066680908203, + "epoch": 1.39, + "learning_rate": 4.007970051926096e-05, + "loss": 129.4895, + "step": 1643, + "task_loss": 2.425184488296509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1570333823083034, + "compression/movement_sparsity/importance_threshold": -0.0010435142107512868, + "compression/movement_sparsity/linear_layer_sparsity": 0.7160801430938274, + "compression/movement_sparsity/model_sparsity": 0.691480587258367, + "compression_loss": 124.36854553222656, + "distillation_loss": 4.8237128257751465, + "epoch": 1.39, + "learning_rate": 4.0073662601135134e-05, + "loss": 129.3783, + "step": 1644, + "task_loss": 1.8762375116348267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1584525527157319, + "compression/movement_sparsity/importance_threshold": -0.0010391962264016835, + "compression/movement_sparsity/linear_layer_sparsity": 0.7171808868566292, + "compression/movement_sparsity/model_sparsity": 0.6925435170866326, + "compression_loss": 124.5201187133789, + "distillation_loss": 4.7112345695495605, + "epoch": 1.39, + "learning_rate": 4.00676246830093e-05, + "loss": 128.8596, + "step": 1645, + "task_loss": 2.30698299407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1598678027654148, + "compression/movement_sparsity/importance_threshold": -0.0010348901701780471, + "compression/movement_sparsity/linear_layer_sparsity": 0.7185453097383627, + "compression/movement_sparsity/model_sparsity": 0.693861067844923, + "compression_loss": 124.67130279541016, + "distillation_loss": 4.574435234069824, + "epoch": 1.39, + "learning_rate": 4.006158676488347e-05, + "loss": 129.5713, + "step": 1646, + "task_loss": 1.7015719413757324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1612791378797014, + "compression/movement_sparsity/importance_threshold": -0.0010305960255822771, + "compression/movement_sparsity/linear_layer_sparsity": 0.7198698224310192, + "compression/movement_sparsity/model_sparsity": 0.695140079451909, + "compression_loss": 124.822021484375, + "distillation_loss": 3.9105381965637207, + "epoch": 1.39, + "learning_rate": 4.005554884675764e-05, + "loss": 129.2591, + "step": 1647, + "task_loss": 3.3302674293518066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.162686563480939, + "compression/movement_sparsity/importance_threshold": -0.001026313776116276, + "compression/movement_sparsity/linear_layer_sparsity": 0.7212080717647921, + "compression/movement_sparsity/model_sparsity": 0.6964323558041303, + "compression_loss": 124.97233581542969, + "distillation_loss": 5.138538837432861, + "epoch": 1.39, + "learning_rate": 4.004951092863181e-05, + "loss": 129.7413, + "step": 1648, + "task_loss": 2.8371078968048096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1640900849914761, + "compression/movement_sparsity/importance_threshold": -0.0010220434052819424, + "compression/movement_sparsity/linear_layer_sparsity": 0.7222304975945615, + "compression/movement_sparsity/model_sparsity": 0.6974196581612975, + "compression_loss": 125.1222915649414, + "distillation_loss": 4.873781204223633, + "epoch": 1.39, + "learning_rate": 4.0043473010505977e-05, + "loss": 129.6072, + "step": 1649, + "task_loss": 3.3788414001464844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1654897078336612, + "compression/movement_sparsity/importance_threshold": -0.0010177848965811768, + "compression/movement_sparsity/linear_layer_sparsity": 0.7234908701895046, + "compression/movement_sparsity/model_sparsity": 0.6986367330802447, + "compression_loss": 125.27172088623047, + "distillation_loss": 3.5707449913024902, + "epoch": 1.39, + "learning_rate": 4.003743509238015e-05, + "loss": 129.4218, + "step": 1650, + "task_loss": 1.7528709173202515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1668854374298419, + "compression/movement_sparsity/importance_threshold": -0.0010135382335158807, + "compression/movement_sparsity/linear_layer_sparsity": 0.7247468308424224, + "compression/movement_sparsity/model_sparsity": 0.6998495476209478, + "compression_loss": 125.4207992553711, + "distillation_loss": 6.146169662475586, + "epoch": 1.4, + "learning_rate": 4.003139717425432e-05, + "loss": 131.0971, + "step": 1651, + "task_loss": 2.7813305854797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.168277279202367, + "compression/movement_sparsity/importance_threshold": -0.001009303399587954, + "compression/movement_sparsity/linear_layer_sparsity": 0.7260744318944965, + "compression/movement_sparsity/model_sparsity": 0.7011315414927046, + "compression_loss": 125.56947326660156, + "distillation_loss": 4.3812785148620605, + "epoch": 1.4, + "learning_rate": 4.0025359256128485e-05, + "loss": 129.5112, + "step": 1652, + "task_loss": 1.6023848056793213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1696652385735846, + "compression/movement_sparsity/importance_threshold": -0.0010050803782992962, + "compression/movement_sparsity/linear_layer_sparsity": 0.7270859709592143, + "compression/movement_sparsity/model_sparsity": 0.7021083310786913, + "compression_loss": 125.71769714355469, + "distillation_loss": 4.6329240798950195, + "epoch": 1.4, + "learning_rate": 4.001932133800266e-05, + "loss": 130.7312, + "step": 1653, + "task_loss": 2.0416886806488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1710493209658424, + "compression/movement_sparsity/importance_threshold": -0.00100086915315181, + "compression/movement_sparsity/linear_layer_sparsity": 0.7281695081481177, + "compression/movement_sparsity/model_sparsity": 0.7031546454318054, + "compression_loss": 125.86560821533203, + "distillation_loss": 6.298150539398193, + "epoch": 1.4, + "learning_rate": 4.0013283419876826e-05, + "loss": 130.6777, + "step": 1654, + "task_loss": 3.371328115463257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.172429531801489, + "compression/movement_sparsity/importance_threshold": -0.0009966697076473937, + "compression/movement_sparsity/linear_layer_sparsity": 0.7292097009876645, + "compression/movement_sparsity/model_sparsity": 0.7041591044473063, + "compression_loss": 126.01304626464844, + "distillation_loss": 6.301766395568848, + "epoch": 1.4, + "learning_rate": 4.0007245501751e-05, + "loss": 131.1358, + "step": 1655, + "task_loss": 2.768242120742798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1738058765028727, + "compression/movement_sparsity/importance_threshold": -0.0009924820252879486, + "compression/movement_sparsity/linear_layer_sparsity": 0.730290543314682, + "compression/movement_sparsity/model_sparsity": 0.7052028165153309, + "compression_loss": 126.1600341796875, + "distillation_loss": 5.068981170654297, + "epoch": 1.4, + "learning_rate": 4.000120758362517e-05, + "loss": 131.0489, + "step": 1656, + "task_loss": 2.4785683155059814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1751783604923411, + "compression/movement_sparsity/importance_threshold": -0.0009883060895753762, + "compression/movement_sparsity/linear_layer_sparsity": 0.7314464602011349, + "compression/movement_sparsity/model_sparsity": 0.7063190241007179, + "compression_loss": 126.30661010742188, + "distillation_loss": 4.9326324462890625, + "epoch": 1.4, + "learning_rate": 3.999516966549934e-05, + "loss": 131.1656, + "step": 1657, + "task_loss": 2.522188663482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.176546989192243, + "compression/movement_sparsity/importance_threshold": -0.0009841418840115753, + "compression/movement_sparsity/linear_layer_sparsity": 0.732678966016313, + "compression/movement_sparsity/model_sparsity": 0.7075091895495133, + "compression_loss": 126.4527359008789, + "distillation_loss": 3.8941092491149902, + "epoch": 1.4, + "learning_rate": 3.998913174737351e-05, + "loss": 131.3093, + "step": 1658, + "task_loss": 1.4228382110595703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1779117680249265, + "compression/movement_sparsity/importance_threshold": -0.0009799893920984467, + "compression/movement_sparsity/linear_layer_sparsity": 0.7334243457352309, + "compression/movement_sparsity/model_sparsity": 0.70822896318203, + "compression_loss": 126.59847259521484, + "distillation_loss": 5.147817611694336, + "epoch": 1.4, + "learning_rate": 3.9983093829247675e-05, + "loss": 131.7854, + "step": 1659, + "task_loss": 2.1113858222961426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1792727024127396, + "compression/movement_sparsity/importance_threshold": -0.0009758485973378919, + "compression/movement_sparsity/linear_layer_sparsity": 0.7345577974898578, + "compression/movement_sparsity/model_sparsity": 0.7093234773819801, + "compression_loss": 126.74385833740234, + "distillation_loss": 5.70904541015625, + "epoch": 1.4, + "learning_rate": 3.997705591112185e-05, + "loss": 132.1562, + "step": 1660, + "task_loss": 1.9991743564605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1806297977780305, + "compression/movement_sparsity/importance_threshold": -0.0009717194832318103, + "compression/movement_sparsity/linear_layer_sparsity": 0.7356654214973855, + "compression/movement_sparsity/model_sparsity": 0.7103930510973991, + "compression_loss": 126.8887710571289, + "distillation_loss": 4.492949485778809, + "epoch": 1.4, + "learning_rate": 3.9971017992996017e-05, + "loss": 131.4506, + "step": 1661, + "task_loss": 2.9952688217163086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1819830595431478, + "compression/movement_sparsity/importance_threshold": -0.0009676020332821025, + "compression/movement_sparsity/linear_layer_sparsity": 0.7366837215651529, + "compression/movement_sparsity/model_sparsity": 0.7113763694251813, + "compression_loss": 127.03329467773438, + "distillation_loss": 7.356082439422607, + "epoch": 1.4, + "learning_rate": 3.9964980074870184e-05, + "loss": 133.021, + "step": 1662, + "task_loss": 3.834693670272827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.183332493130439, + "compression/movement_sparsity/importance_threshold": -0.0009634962309906696, + "compression/movement_sparsity/linear_layer_sparsity": 0.7376262793200973, + "compression/movement_sparsity/model_sparsity": 0.7122865474215966, + "compression_loss": 127.17744445800781, + "distillation_loss": 5.107526779174805, + "epoch": 1.41, + "learning_rate": 3.995894215674436e-05, + "loss": 132.149, + "step": 1663, + "task_loss": 2.1269724369049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1846781039622531, + "compression/movement_sparsity/importance_threshold": -0.0009594020598594107, + "compression/movement_sparsity/linear_layer_sparsity": 0.7387805506714165, + "compression/movement_sparsity/model_sparsity": 0.713401166001044, + "compression_loss": 127.3211898803711, + "distillation_loss": 4.4077911376953125, + "epoch": 1.41, + "learning_rate": 3.9952904238618525e-05, + "loss": 132.5378, + "step": 1664, + "task_loss": 2.9023802280426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1860198974609375, + "compression/movement_sparsity/importance_threshold": -0.0009553195033902284, + "compression/movement_sparsity/linear_layer_sparsity": 0.7400044472375614, + "compression/movement_sparsity/model_sparsity": 0.7145830179549957, + "compression_loss": 127.46450805664062, + "distillation_loss": 6.424215316772461, + "epoch": 1.41, + "learning_rate": 3.99468663204927e-05, + "loss": 133.0279, + "step": 1665, + "task_loss": 3.93650484085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1873578790488408, + "compression/movement_sparsity/importance_threshold": -0.0009512485450850218, + "compression/movement_sparsity/linear_layer_sparsity": 0.7409059500833356, + "compression/movement_sparsity/model_sparsity": 0.7154535514046703, + "compression_loss": 127.60740661621094, + "distillation_loss": 4.932514190673828, + "epoch": 1.41, + "learning_rate": 3.9940828402366866e-05, + "loss": 132.3366, + "step": 1666, + "task_loss": 3.054093837738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1886920541483113, + "compression/movement_sparsity/importance_threshold": -0.0009471891684456907, + "compression/movement_sparsity/linear_layer_sparsity": 0.741904146004469, + "compression/movement_sparsity/model_sparsity": 0.7164174562251029, + "compression_loss": 127.74990844726562, + "distillation_loss": 4.859705924987793, + "epoch": 1.41, + "learning_rate": 3.993479048424103e-05, + "loss": 132.7677, + "step": 1667, + "task_loss": 2.708770513534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1900224281816971, + "compression/movement_sparsity/importance_threshold": -0.0009431413569741371, + "compression/movement_sparsity/linear_layer_sparsity": 0.7430379316357898, + "compression/movement_sparsity/model_sparsity": 0.7175122928320552, + "compression_loss": 127.89200592041016, + "distillation_loss": 4.668491363525391, + "epoch": 1.41, + "learning_rate": 3.992875256611521e-05, + "loss": 132.4724, + "step": 1668, + "task_loss": 1.7856543064117432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1913490065713461, + "compression/movement_sparsity/importance_threshold": -0.0009391050941722602, + "compression/movement_sparsity/linear_layer_sparsity": 0.744100541910492, + "compression/movement_sparsity/model_sparsity": 0.71853839917485, + "compression_loss": 128.03366088867188, + "distillation_loss": 4.772624969482422, + "epoch": 1.41, + "learning_rate": 3.9922714647989374e-05, + "loss": 132.6452, + "step": 1669, + "task_loss": 2.0295071601867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.192671794739607, + "compression/movement_sparsity/importance_threshold": -0.0009350803635419609, + "compression/movement_sparsity/linear_layer_sparsity": 0.7450633230537469, + "compression/movement_sparsity/model_sparsity": 0.7194681058239728, + "compression_loss": 128.17494201660156, + "distillation_loss": 5.829484462738037, + "epoch": 1.41, + "learning_rate": 3.991667672986354e-05, + "loss": 133.6759, + "step": 1670, + "task_loss": 2.7783358097076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1939907981088276, + "compression/movement_sparsity/importance_threshold": -0.0009310671485851406, + "compression/movement_sparsity/linear_layer_sparsity": 0.7462112507478837, + "compression/movement_sparsity/model_sparsity": 0.7205765986703774, + "compression_loss": 128.3157501220703, + "distillation_loss": 4.5924072265625, + "epoch": 1.41, + "learning_rate": 3.9910638811737716e-05, + "loss": 132.5307, + "step": 1671, + "task_loss": 2.5231821537017822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1953060221013563, + "compression/movement_sparsity/importance_threshold": -0.0009270654328036982, + "compression/movement_sparsity/linear_layer_sparsity": 0.7470638048855127, + "compression/movement_sparsity/model_sparsity": 0.7213998649506155, + "compression_loss": 128.45616149902344, + "distillation_loss": 5.341917037963867, + "epoch": 1.41, + "learning_rate": 3.990460089361188e-05, + "loss": 133.514, + "step": 1672, + "task_loss": 2.1828250885009766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1966174721395413, + "compression/movement_sparsity/importance_threshold": -0.0009230751996995349, + "compression/movement_sparsity/linear_layer_sparsity": 0.748172358978116, + "compression/movement_sparsity/model_sparsity": 0.7224703367998265, + "compression_loss": 128.59616088867188, + "distillation_loss": 6.447425842285156, + "epoch": 1.41, + "learning_rate": 3.989856297548606e-05, + "loss": 134.2175, + "step": 1673, + "task_loss": 3.79872727394104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1979251536457305, + "compression/movement_sparsity/importance_threshold": -0.0009190964327745518, + "compression/movement_sparsity/linear_layer_sparsity": 0.7492809488432222, + "compression/movement_sparsity/model_sparsity": 0.723540843192645, + "compression_loss": 128.7356719970703, + "distillation_loss": 3.91593074798584, + "epoch": 1.41, + "learning_rate": 3.9892525057360224e-05, + "loss": 132.9877, + "step": 1674, + "task_loss": 1.5377082824707031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.1992290720422725, + "compression/movement_sparsity/importance_threshold": -0.0009151291155306482, + "compression/movement_sparsity/linear_layer_sparsity": 0.7503076912216758, + "compression/movement_sparsity/model_sparsity": 0.7245323138117697, + "compression_loss": 128.87486267089844, + "distillation_loss": 5.185428142547607, + "epoch": 1.42, + "learning_rate": 3.98864871392344e-05, + "loss": 133.8421, + "step": 1675, + "task_loss": 2.693092107772827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2005292327515154, + "compression/movement_sparsity/importance_threshold": -0.0009111732314697248, + "compression/movement_sparsity/linear_layer_sparsity": 0.7512944160935435, + "compression/movement_sparsity/model_sparsity": 0.725485141648768, + "compression_loss": 129.0135498046875, + "distillation_loss": 4.279391765594482, + "epoch": 1.42, + "learning_rate": 3.9880449221108565e-05, + "loss": 133.8896, + "step": 1676, + "task_loss": 2.457597255706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2018256411958068, + "compression/movement_sparsity/importance_threshold": -0.0009072287640936832, + "compression/movement_sparsity/linear_layer_sparsity": 0.7524503806766669, + "compression/movement_sparsity/model_sparsity": 0.7266013952922982, + "compression_loss": 129.1519317626953, + "distillation_loss": 6.514693737030029, + "epoch": 1.42, + "learning_rate": 3.987441130298273e-05, + "loss": 134.5309, + "step": 1677, + "task_loss": 2.244901657104492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2031183027974959, + "compression/movement_sparsity/importance_threshold": -0.0009032956969044219, + "compression/movement_sparsity/linear_layer_sparsity": 0.7535459016540442, + "compression/movement_sparsity/model_sparsity": 0.7276592817538858, + "compression_loss": 129.28981018066406, + "distillation_loss": 7.346892356872559, + "epoch": 1.42, + "learning_rate": 3.9868373384856906e-05, + "loss": 134.6019, + "step": 1678, + "task_loss": 3.440413236618042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2044072229789304, + "compression/movement_sparsity/importance_threshold": -0.0008993740134038428, + "compression/movement_sparsity/linear_layer_sparsity": 0.7546157618226691, + "compression/movement_sparsity/model_sparsity": 0.7286923889344435, + "compression_loss": 129.4272918701172, + "distillation_loss": 3.9865946769714355, + "epoch": 1.42, + "learning_rate": 3.986233546673107e-05, + "loss": 134.5362, + "step": 1679, + "task_loss": 2.1240487098693848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2056924071624582, + "compression/movement_sparsity/importance_threshold": -0.0008954636970938466, + "compression/movement_sparsity/linear_layer_sparsity": 0.7556085322475282, + "compression/movement_sparsity/model_sparsity": 0.7296510546410897, + "compression_loss": 129.5644073486328, + "distillation_loss": 4.212987899780273, + "epoch": 1.42, + "learning_rate": 3.985629754860524e-05, + "loss": 133.9234, + "step": 1680, + "task_loss": 2.631201982498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.206973860770428, + "compression/movement_sparsity/importance_threshold": -0.0008915647314763327, + "compression/movement_sparsity/linear_layer_sparsity": 0.7564401475467886, + "compression/movement_sparsity/model_sparsity": 0.7304541013964725, + "compression_loss": 129.7010955810547, + "distillation_loss": 4.671471118927002, + "epoch": 1.42, + "learning_rate": 3.9850259630479414e-05, + "loss": 134.717, + "step": 1681, + "task_loss": 2.4621150493621826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2082515892251877, + "compression/movement_sparsity/importance_threshold": -0.0008876771000532017, + "compression/movement_sparsity/linear_layer_sparsity": 0.7574919545256127, + "compression/movement_sparsity/model_sparsity": 0.7314697755698374, + "compression_loss": 129.83741760253906, + "distillation_loss": 6.7365570068359375, + "epoch": 1.42, + "learning_rate": 3.984422171235358e-05, + "loss": 136.1288, + "step": 1682, + "task_loss": 2.460587978363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2095255979490855, + "compression/movement_sparsity/importance_threshold": -0.0008838007863263553, + "compression/movement_sparsity/linear_layer_sparsity": 0.7585072616273035, + "compression/movement_sparsity/model_sparsity": 0.7324502037491352, + "compression_loss": 129.97329711914062, + "distillation_loss": 7.12745475769043, + "epoch": 1.42, + "learning_rate": 3.983818379422775e-05, + "loss": 135.8651, + "step": 1683, + "task_loss": 3.2907121181488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2107958923644702, + "compression/movement_sparsity/importance_threshold": -0.0008799357737976918, + "compression/movement_sparsity/linear_layer_sparsity": 0.7595094282962597, + "compression/movement_sparsity/model_sparsity": 0.7334179429099873, + "compression_loss": 130.10877990722656, + "distillation_loss": 5.668455123901367, + "epoch": 1.42, + "learning_rate": 3.983214587610192e-05, + "loss": 134.9896, + "step": 1684, + "task_loss": 3.0703577995300293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2120624778936888, + "compression/movement_sparsity/importance_threshold": -0.0008760820459691137, + "compression/movement_sparsity/linear_layer_sparsity": 0.7604597963810056, + "compression/movement_sparsity/model_sparsity": 0.7343356629273481, + "compression_loss": 130.24391174316406, + "distillation_loss": 6.114388942718506, + "epoch": 1.42, + "learning_rate": 3.98261079579761e-05, + "loss": 135.7736, + "step": 1685, + "task_loss": 2.9886763095855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2133253599590907, + "compression/movement_sparsity/importance_threshold": -0.0008722395863425205, + "compression/movement_sparsity/linear_layer_sparsity": 0.7614440052535021, + "compression/movement_sparsity/model_sparsity": 0.7352860611972937, + "compression_loss": 130.37860107421875, + "distillation_loss": 6.420416831970215, + "epoch": 1.42, + "learning_rate": 3.982007003985026e-05, + "loss": 136.2085, + "step": 1686, + "task_loss": 4.890620708465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2145845439830232, + "compression/movement_sparsity/importance_threshold": -0.000868408378419813, + "compression/movement_sparsity/linear_layer_sparsity": 0.762181193069254, + "compression/movement_sparsity/model_sparsity": 0.7359979243437196, + "compression_loss": 130.51290893554688, + "distillation_loss": 4.34592342376709, + "epoch": 1.43, + "learning_rate": 3.981403212172443e-05, + "loss": 136.002, + "step": 1687, + "task_loss": 2.0691323280334473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.215840035387835, + "compression/movement_sparsity/importance_threshold": -0.0008645884057028913, + "compression/movement_sparsity/linear_layer_sparsity": 0.7632171051326196, + "compression/movement_sparsity/model_sparsity": 0.7369982496408702, + "compression_loss": 130.64688110351562, + "distillation_loss": 4.932995796203613, + "epoch": 1.43, + "learning_rate": 3.9807994203598605e-05, + "loss": 136.405, + "step": 1688, + "task_loss": 2.7900893688201904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.217091839595874, + "compression/movement_sparsity/importance_threshold": -0.000860779651693656, + "compression/movement_sparsity/linear_layer_sparsity": 0.7642276306430885, + "compression/movement_sparsity/model_sparsity": 0.7379740604913144, + "compression_loss": 130.7803955078125, + "distillation_loss": 4.772183418273926, + "epoch": 1.43, + "learning_rate": 3.980195628547277e-05, + "loss": 135.5182, + "step": 1689, + "task_loss": 1.9071472883224487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2183399620294888, + "compression/movement_sparsity/importance_threshold": -0.000856982099894008, + "compression/movement_sparsity/linear_layer_sparsity": 0.7652892750602123, + "compression/movement_sparsity/model_sparsity": 0.7389992341567098, + "compression_loss": 130.91357421875, + "distillation_loss": 3.9670186042785645, + "epoch": 1.43, + "learning_rate": 3.979591836734694e-05, + "loss": 136.0866, + "step": 1690, + "task_loss": 2.5799996852874756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.219584408111027, + "compression/movement_sparsity/importance_threshold": -0.0008531957338058474, + "compression/movement_sparsity/linear_layer_sparsity": 0.766157426009109, + "compression/movement_sparsity/model_sparsity": 0.7398375614497671, + "compression_loss": 131.04637145996094, + "distillation_loss": 6.1601104736328125, + "epoch": 1.43, + "learning_rate": 3.9789880449221113e-05, + "loss": 136.4336, + "step": 1691, + "task_loss": 3.499772310256958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2208251832628374, + "compression/movement_sparsity/importance_threshold": -0.0008494205369310743, + "compression/movement_sparsity/linear_layer_sparsity": 0.7672088514145687, + "compression/movement_sparsity/model_sparsity": 0.7408528671579865, + "compression_loss": 131.17872619628906, + "distillation_loss": 5.559735298156738, + "epoch": 1.43, + "learning_rate": 3.978384253109528e-05, + "loss": 135.6633, + "step": 1692, + "task_loss": 2.7942800521850586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2220622929072678, + "compression/movement_sparsity/importance_threshold": -0.0008456564927715898, + "compression/movement_sparsity/linear_layer_sparsity": 0.7682637944494812, + "compression/movement_sparsity/model_sparsity": 0.7418715696542654, + "compression_loss": 131.31069946289062, + "distillation_loss": 9.711210250854492, + "epoch": 1.43, + "learning_rate": 3.977780461296945e-05, + "loss": 137.2612, + "step": 1693, + "task_loss": 4.893489360809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2232957424666666, + "compression/movement_sparsity/importance_threshold": -0.0008419035848292937, + "compression/movement_sparsity/linear_layer_sparsity": 0.7690264045906329, + "compression/movement_sparsity/model_sparsity": 0.7426079817910052, + "compression_loss": 131.44224548339844, + "distillation_loss": 6.687668323516846, + "epoch": 1.43, + "learning_rate": 3.977176669484362e-05, + "loss": 136.9284, + "step": 1694, + "task_loss": 1.8430663347244263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2245255373633819, + "compression/movement_sparsity/importance_threshold": -0.0008381617966060871, + "compression/movement_sparsity/linear_layer_sparsity": 0.7700698885004472, + "compression/movement_sparsity/model_sparsity": 0.7436156188183854, + "compression_loss": 131.57342529296875, + "distillation_loss": 4.849740982055664, + "epoch": 1.43, + "learning_rate": 3.976572877671779e-05, + "loss": 137.1065, + "step": 1695, + "task_loss": 2.328639030456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.225751683019762, + "compression/movement_sparsity/importance_threshold": -0.0008344311116038699, + "compression/movement_sparsity/linear_layer_sparsity": 0.7709610530928811, + "compression/movement_sparsity/model_sparsity": 0.7444761691655263, + "compression_loss": 131.70416259765625, + "distillation_loss": 6.2563958168029785, + "epoch": 1.43, + "learning_rate": 3.9759690858591956e-05, + "loss": 136.7041, + "step": 1696, + "task_loss": 3.334831953048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2269741848581548, + "compression/movement_sparsity/importance_threshold": -0.0008307115133245435, + "compression/movement_sparsity/linear_layer_sparsity": 0.7718619597302734, + "compression/movement_sparsity/model_sparsity": 0.7453461268884112, + "compression_loss": 131.8345184326172, + "distillation_loss": 5.515175819396973, + "epoch": 1.43, + "learning_rate": 3.975365294046613e-05, + "loss": 136.3162, + "step": 1697, + "task_loss": 3.497478485107422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.228193048300909, + "compression/movement_sparsity/importance_threshold": -0.0008270029852700065, + "compression/movement_sparsity/linear_layer_sparsity": 0.7728944972541981, + "compression/movement_sparsity/model_sparsity": 0.7463431935719321, + "compression_loss": 131.9645233154297, + "distillation_loss": 5.253702163696289, + "epoch": 1.44, + "learning_rate": 3.9747615022340304e-05, + "loss": 137.0365, + "step": 1698, + "task_loss": 3.289030075073242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2294082787703722, + "compression/movement_sparsity/importance_threshold": -0.0008233055109421617, + "compression/movement_sparsity/linear_layer_sparsity": 0.7737714720871453, + "compression/movement_sparsity/model_sparsity": 0.7471900416214774, + "compression_loss": 132.0940704345703, + "distillation_loss": 5.983340740203857, + "epoch": 1.44, + "learning_rate": 3.9741577104214464e-05, + "loss": 137.0967, + "step": 1699, + "task_loss": 3.4343206882476807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.230619881688893, + "compression/movement_sparsity/importance_threshold": -0.0008196190738429085, + "compression/movement_sparsity/linear_layer_sparsity": 0.7746836232146183, + "compression/movement_sparsity/model_sparsity": 0.7480708575516166, + "compression_loss": 132.2232208251953, + "distillation_loss": 8.211292266845703, + "epoch": 1.44, + "learning_rate": 3.973553918608864e-05, + "loss": 138.5958, + "step": 1700, + "task_loss": 3.1914620399475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2318278624788197, + "compression/movement_sparsity/importance_threshold": -0.0008159436574741458, + "compression/movement_sparsity/linear_layer_sparsity": 0.7755221783794427, + "compression/movement_sparsity/model_sparsity": 0.7488806057668319, + "compression_loss": 132.35198974609375, + "distillation_loss": 4.689528465270996, + "epoch": 1.44, + "learning_rate": 3.972950126796281e-05, + "loss": 137.5942, + "step": 1701, + "task_loss": 2.8297510147094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2330322265625, + "compression/movement_sparsity/importance_threshold": -0.000812279245337777, + "compression/movement_sparsity/linear_layer_sparsity": 0.7763946339528558, + "compression/movement_sparsity/model_sparsity": 0.7497230898073111, + "compression_loss": 132.48025512695312, + "distillation_loss": 4.332784175872803, + "epoch": 1.44, + "learning_rate": 3.972346334983697e-05, + "loss": 137.2358, + "step": 1702, + "task_loss": 1.761114239692688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2342329793622824, + "compression/movement_sparsity/importance_threshold": -0.0008086258209357008, + "compression/movement_sparsity/linear_layer_sparsity": 0.7773658693309645, + "compression/movement_sparsity/model_sparsity": 0.7506609602623122, + "compression_loss": 132.60812377929688, + "distillation_loss": 6.659272193908691, + "epoch": 1.44, + "learning_rate": 3.971742543171115e-05, + "loss": 138.7303, + "step": 1703, + "task_loss": 3.077975034713745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2354301263005154, + "compression/movement_sparsity/importance_threshold": -0.0008049833677698172, + "compression/movement_sparsity/linear_layer_sparsity": 0.7781342507692519, + "compression/movement_sparsity/model_sparsity": 0.7514029454343766, + "compression_loss": 132.73570251464844, + "distillation_loss": 4.140135765075684, + "epoch": 1.44, + "learning_rate": 3.971138751358532e-05, + "loss": 137.6002, + "step": 1704, + "task_loss": 2.381814479827881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2366236727995465, + "compression/movement_sparsity/importance_threshold": -0.0008013518693420285, + "compression/movement_sparsity/linear_layer_sparsity": 0.7790292907161674, + "compression/movement_sparsity/model_sparsity": 0.7522672380056507, + "compression_loss": 132.86288452148438, + "distillation_loss": 6.575408935546875, + "epoch": 1.44, + "learning_rate": 3.970534959545949e-05, + "loss": 138.1842, + "step": 1705, + "task_loss": 4.056782245635986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2378136242817244, + "compression/movement_sparsity/importance_threshold": -0.0007977313091542338, + "compression/movement_sparsity/linear_layer_sparsity": 0.7799712164902272, + "compression/movement_sparsity/model_sparsity": 0.7531768057316688, + "compression_loss": 132.98959350585938, + "distillation_loss": 4.539612770080566, + "epoch": 1.44, + "learning_rate": 3.9699311677333655e-05, + "loss": 137.9326, + "step": 1706, + "task_loss": 2.8210482597351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2389999861693972, + "compression/movement_sparsity/importance_threshold": -0.0007941216707083336, + "compression/movement_sparsity/linear_layer_sparsity": 0.7807475513483276, + "compression/movement_sparsity/model_sparsity": 0.7539264710991082, + "compression_loss": 133.11599731445312, + "distillation_loss": 6.816411972045898, + "epoch": 1.44, + "learning_rate": 3.969327375920783e-05, + "loss": 137.8192, + "step": 1707, + "task_loss": 3.2334794998168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.240182763884913, + "compression/movement_sparsity/importance_threshold": -0.000790522937506229, + "compression/movement_sparsity/linear_layer_sparsity": 0.7816695876107708, + "compression/movement_sparsity/model_sparsity": 0.754816832579421, + "compression_loss": 133.24200439453125, + "distillation_loss": 6.478850364685059, + "epoch": 1.44, + "learning_rate": 3.9687235841081996e-05, + "loss": 138.5162, + "step": 1708, + "task_loss": 2.6535329818725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.24136196285062, + "compression/movement_sparsity/importance_threshold": -0.0007869350930498195, + "compression/movement_sparsity/linear_layer_sparsity": 0.782560346781505, + "compression/movement_sparsity/model_sparsity": 0.7556769914323448, + "compression_loss": 133.36756896972656, + "distillation_loss": 5.8360419273376465, + "epoch": 1.44, + "learning_rate": 3.968119792295616e-05, + "loss": 138.6459, + "step": 1709, + "task_loss": 2.3818163871765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2425375884888668, + "compression/movement_sparsity/importance_threshold": -0.0007833581208410065, + "compression/movement_sparsity/linear_layer_sparsity": 0.7835102736720484, + "compression/movement_sparsity/model_sparsity": 0.7565942854118811, + "compression_loss": 133.49282836914062, + "distillation_loss": 5.689027309417725, + "epoch": 1.45, + "learning_rate": 3.967516000483034e-05, + "loss": 139.0112, + "step": 1710, + "task_loss": 2.6477861404418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2437096462220008, + "compression/movement_sparsity/importance_threshold": -0.0007797920043816904, + "compression/movement_sparsity/linear_layer_sparsity": 0.7845658725361808, + "compression/movement_sparsity/model_sparsity": 0.7576136212076287, + "compression_loss": 133.61770629882812, + "distillation_loss": 6.724252700805664, + "epoch": 1.45, + "learning_rate": 3.9669122086704504e-05, + "loss": 139.1362, + "step": 1711, + "task_loss": 3.3332371711730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2448781414723709, + "compression/movement_sparsity/importance_threshold": -0.0007762367271737704, + "compression/movement_sparsity/linear_layer_sparsity": 0.7854554631384867, + "compression/movement_sparsity/model_sparsity": 0.7584726516360445, + "compression_loss": 133.74209594726562, + "distillation_loss": 6.169949531555176, + "epoch": 1.45, + "learning_rate": 3.966308416857867e-05, + "loss": 138.9246, + "step": 1712, + "task_loss": 3.340078592300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.246043079662325, + "compression/movement_sparsity/importance_threshold": -0.0007726922727191486, + "compression/movement_sparsity/linear_layer_sparsity": 0.7863619622105003, + "compression/movement_sparsity/model_sparsity": 0.7593480096762171, + "compression_loss": 133.86624145507812, + "distillation_loss": 7.8686299324035645, + "epoch": 1.45, + "learning_rate": 3.9657046250452846e-05, + "loss": 139.8936, + "step": 1713, + "task_loss": 4.0748090744018555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.247204466214211, + "compression/movement_sparsity/importance_threshold": -0.0007691586245197253, + "compression/movement_sparsity/linear_layer_sparsity": 0.7873103508834186, + "compression/movement_sparsity/model_sparsity": 0.7602638182806359, + "compression_loss": 133.9898223876953, + "distillation_loss": 6.3871893882751465, + "epoch": 1.45, + "learning_rate": 3.965100833232702e-05, + "loss": 139.7479, + "step": 1714, + "task_loss": 3.305750608444214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2483623065503777, + "compression/movement_sparsity/importance_threshold": -0.0007656357660773996, + "compression/movement_sparsity/linear_layer_sparsity": 0.7882580598787817, + "compression/movement_sparsity/model_sparsity": 0.7611789705565144, + "compression_loss": 134.11309814453125, + "distillation_loss": 5.852474212646484, + "epoch": 1.45, + "learning_rate": 3.964497041420119e-05, + "loss": 139.2029, + "step": 1715, + "task_loss": 2.626415491104126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2495166060931733, + "compression/movement_sparsity/importance_threshold": -0.0007621236808940729, + "compression/movement_sparsity/linear_layer_sparsity": 0.7890454126677778, + "compression/movement_sparsity/model_sparsity": 0.7619392753550279, + "compression_loss": 134.2360076904297, + "distillation_loss": 4.506422996520996, + "epoch": 1.45, + "learning_rate": 3.9638932496075354e-05, + "loss": 140.0402, + "step": 1716, + "task_loss": 1.7811309099197388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2506673702649453, + "compression/movement_sparsity/importance_threshold": -0.0007586223524716461, + "compression/movement_sparsity/linear_layer_sparsity": 0.7899333815832853, + "compression/movement_sparsity/model_sparsity": 0.7627967398065758, + "compression_loss": 134.35855102539062, + "distillation_loss": 5.711544036865234, + "epoch": 1.45, + "learning_rate": 3.963289457794953e-05, + "loss": 140.2148, + "step": 1717, + "task_loss": 3.5613555908203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2518146044880425, + "compression/movement_sparsity/importance_threshold": -0.0007551317643120182, + "compression/movement_sparsity/linear_layer_sparsity": 0.7907794728220555, + "compression/movement_sparsity/model_sparsity": 0.7636137652084133, + "compression_loss": 134.480712890625, + "distillation_loss": 5.139287948608398, + "epoch": 1.45, + "learning_rate": 3.9626856659823695e-05, + "loss": 140.1447, + "step": 1718, + "task_loss": 3.3396553993225098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2529583141848128, + "compression/movement_sparsity/importance_threshold": -0.0007516518999170912, + "compression/movement_sparsity/linear_layer_sparsity": 0.7914692501472873, + "compression/movement_sparsity/model_sparsity": 0.7642798465605201, + "compression_loss": 134.6024932861328, + "distillation_loss": 7.168874740600586, + "epoch": 1.45, + "learning_rate": 3.962081874169786e-05, + "loss": 141.2132, + "step": 1719, + "task_loss": 3.6089367866516113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2540985047776043, + "compression/movement_sparsity/importance_threshold": -0.0007481827427887654, + "compression/movement_sparsity/linear_layer_sparsity": 0.7924117840538965, + "compression/movement_sparsity/model_sparsity": 0.7651900015278638, + "compression_loss": 134.72389221191406, + "distillation_loss": 5.392602920532227, + "epoch": 1.45, + "learning_rate": 3.9614780823572036e-05, + "loss": 140.2986, + "step": 1720, + "task_loss": 2.903820514678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.255235181688766, + "compression/movement_sparsity/importance_threshold": -0.0007447242764289396, + "compression/movement_sparsity/linear_layer_sparsity": 0.7931953568817519, + "compression/movement_sparsity/model_sparsity": 0.7659466562185304, + "compression_loss": 134.84487915039062, + "distillation_loss": 5.348937034606934, + "epoch": 1.45, + "learning_rate": 3.9608742905446203e-05, + "loss": 140.2073, + "step": 1721, + "task_loss": 2.669987201690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2563683503406449, + "compression/movement_sparsity/importance_threshold": -0.0007412764843395168, + "compression/movement_sparsity/linear_layer_sparsity": 0.7940093721095818, + "compression/movement_sparsity/model_sparsity": 0.7667327075190805, + "compression_loss": 134.96548461914062, + "distillation_loss": 4.224820137023926, + "epoch": 1.46, + "learning_rate": 3.960270498732037e-05, + "loss": 140.2545, + "step": 1722, + "task_loss": 2.1910085678100586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.25749801615559, + "compression/movement_sparsity/importance_threshold": -0.0007378393500223957, + "compression/movement_sparsity/linear_layer_sparsity": 0.7949919473711121, + "compression/movement_sparsity/model_sparsity": 0.7676815282976223, + "compression_loss": 135.08578491210938, + "distillation_loss": 7.336986064910889, + "epoch": 1.46, + "learning_rate": 3.9596667069194545e-05, + "loss": 140.5594, + "step": 1723, + "task_loss": 3.3643126487731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2586241845559494, + "compression/movement_sparsity/importance_threshold": -0.0007344128569794768, + "compression/movement_sparsity/linear_layer_sparsity": 0.7957787397242293, + "compression/movement_sparsity/model_sparsity": 0.7684412919129535, + "compression_loss": 135.20565795898438, + "distillation_loss": 6.3377556800842285, + "epoch": 1.46, + "learning_rate": 3.959062915106871e-05, + "loss": 141.442, + "step": 1724, + "task_loss": 2.8359532356262207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.259746860964071, + "compression/movement_sparsity/importance_threshold": -0.000730996988712661, + "compression/movement_sparsity/linear_layer_sparsity": 0.7964351890009188, + "compression/movement_sparsity/model_sparsity": 0.7690751901375146, + "compression_loss": 135.32522583007812, + "distillation_loss": 6.394156455993652, + "epoch": 1.46, + "learning_rate": 3.9584591232942886e-05, + "loss": 141.6865, + "step": 1725, + "task_loss": 3.4933269023895264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.260866050802303, + "compression/movement_sparsity/importance_threshold": -0.0007275917287238491, + "compression/movement_sparsity/linear_layer_sparsity": 0.7973268305600582, + "compression/movement_sparsity/model_sparsity": 0.7699362010660871, + "compression_loss": 135.44436645507812, + "distillation_loss": 4.957626819610596, + "epoch": 1.46, + "learning_rate": 3.957855331481705e-05, + "loss": 140.783, + "step": 1726, + "task_loss": 3.05924916267395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2619817594929943, + "compression/movement_sparsity/importance_threshold": -0.0007241970605149405, + "compression/movement_sparsity/linear_layer_sparsity": 0.7982064525582204, + "compression/movement_sparsity/model_sparsity": 0.7707856053425788, + "compression_loss": 135.5631866455078, + "distillation_loss": 5.330682754516602, + "epoch": 1.46, + "learning_rate": 3.957251539669122e-05, + "loss": 141.2486, + "step": 1727, + "task_loss": 2.1685941219329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.263093992458492, + "compression/movement_sparsity/importance_threshold": -0.0007208129675878375, + "compression/movement_sparsity/linear_layer_sparsity": 0.7988592530396135, + "compression/movement_sparsity/model_sparsity": 0.7714159801191868, + "compression_loss": 135.68154907226562, + "distillation_loss": 8.086620330810547, + "epoch": 1.46, + "learning_rate": 3.9566477478565394e-05, + "loss": 142.455, + "step": 1728, + "task_loss": 3.685267210006714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2642027551211452, + "compression/movement_sparsity/importance_threshold": -0.0007174394334444382, + "compression/movement_sparsity/linear_layer_sparsity": 0.7996886742920288, + "compression/movement_sparsity/model_sparsity": 0.7722169081999833, + "compression_loss": 135.79966735839844, + "distillation_loss": 6.91874885559082, + "epoch": 1.46, + "learning_rate": 3.956043956043956e-05, + "loss": 141.5063, + "step": 1729, + "task_loss": 3.3561720848083496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2653080529033018, + "compression/movement_sparsity/importance_threshold": -0.0007140764415866445, + "compression/movement_sparsity/linear_layer_sparsity": 0.8004280919271286, + "compression/movement_sparsity/model_sparsity": 0.7729309245646029, + "compression_loss": 135.91737365722656, + "distillation_loss": 7.179739952087402, + "epoch": 1.46, + "learning_rate": 3.9554401642313735e-05, + "loss": 141.1135, + "step": 1730, + "task_loss": 4.365039348602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2664098912273096, + "compression/movement_sparsity/importance_threshold": -0.0007107239755163573, + "compression/movement_sparsity/linear_layer_sparsity": 0.8012194870089533, + "compression/movement_sparsity/model_sparsity": 0.7736951327907508, + "compression_loss": 136.03460693359375, + "distillation_loss": 7.279413223266602, + "epoch": 1.46, + "learning_rate": 3.95483637241879e-05, + "loss": 142.204, + "step": 1731, + "task_loss": 3.3209292888641357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2675082755155174, + "compression/movement_sparsity/importance_threshold": -0.0007073820187354762, + "compression/movement_sparsity/linear_layer_sparsity": 0.8021084575545421, + "compression/movement_sparsity/model_sparsity": 0.7745535644633054, + "compression_loss": 136.15151977539062, + "distillation_loss": 5.9051289558410645, + "epoch": 1.46, + "learning_rate": 3.954232580606207e-05, + "loss": 141.9615, + "step": 1732, + "task_loss": 3.937056303024292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.268603211190273, + "compression/movement_sparsity/importance_threshold": -0.0007040505547459015, + "compression/movement_sparsity/linear_layer_sparsity": 0.8028473982229365, + "compression/movement_sparsity/model_sparsity": 0.775267120246493, + "compression_loss": 136.2680206298828, + "distillation_loss": 6.020902633666992, + "epoch": 1.46, + "learning_rate": 3.9536287887936244e-05, + "loss": 141.8759, + "step": 1733, + "task_loss": 2.859494209289551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2696947036739248, + "compression/movement_sparsity/importance_threshold": -0.0007007295670495343, + "compression/movement_sparsity/linear_layer_sparsity": 0.8036293612881612, + "compression/movement_sparsity/model_sparsity": 0.7760222204748275, + "compression_loss": 136.38421630859375, + "distillation_loss": 6.9229888916015625, + "epoch": 1.47, + "learning_rate": 3.953024996981041e-05, + "loss": 142.41, + "step": 1734, + "task_loss": 3.269270896911621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2707827583888207, + "compression/movement_sparsity/importance_threshold": -0.0006974190391482744, + "compression/movement_sparsity/linear_layer_sparsity": 0.8043100523976544, + "compression/movement_sparsity/model_sparsity": 0.7766795277506588, + "compression_loss": 136.49996948242188, + "distillation_loss": 5.054283618927002, + "epoch": 1.47, + "learning_rate": 3.9524212051684585e-05, + "loss": 141.71, + "step": 1735, + "task_loss": 1.6233290433883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2718673807573095, + "compression/movement_sparsity/importance_threshold": -0.0006941189545440229, + "compression/movement_sparsity/linear_layer_sparsity": 0.8051874207281335, + "compression/movement_sparsity/model_sparsity": 0.7775267557798853, + "compression_loss": 136.61537170410156, + "distillation_loss": 3.486783981323242, + "epoch": 1.47, + "learning_rate": 3.951817413355875e-05, + "loss": 141.9053, + "step": 1736, + "task_loss": 2.760249614715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2729485762017387, + "compression/movement_sparsity/importance_threshold": -0.0006908292967386801, + "compression/movement_sparsity/linear_layer_sparsity": 0.8060227086710258, + "compression/movement_sparsity/model_sparsity": 0.7783333490122929, + "compression_loss": 136.73033142089844, + "distillation_loss": 6.09597110748291, + "epoch": 1.47, + "learning_rate": 3.951213621543292e-05, + "loss": 142.905, + "step": 1737, + "task_loss": 2.4623823165893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2740263501444573, + "compression/movement_sparsity/importance_threshold": -0.0006875500492341453, + "compression/movement_sparsity/linear_layer_sparsity": 0.8067632471778834, + "compression/movement_sparsity/model_sparsity": 0.7790484477432771, + "compression_loss": 136.8450469970703, + "distillation_loss": 5.260236740112305, + "epoch": 1.47, + "learning_rate": 3.950609829730709e-05, + "loss": 142.8653, + "step": 1738, + "task_loss": 3.8331313133239746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2751007080078125, + "compression/movement_sparsity/importance_threshold": -0.0006842811955323214, + "compression/movement_sparsity/linear_layer_sparsity": 0.8074705053328693, + "compression/movement_sparsity/model_sparsity": 0.7797314094048586, + "compression_loss": 136.95932006835938, + "distillation_loss": 5.983112335205078, + "epoch": 1.47, + "learning_rate": 3.950006037918126e-05, + "loss": 142.5842, + "step": 1739, + "task_loss": 3.2135629653930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.276171655214153, + "compression/movement_sparsity/importance_threshold": -0.0006810227191351072, + "compression/movement_sparsity/linear_layer_sparsity": 0.8083913014818783, + "compression/movement_sparsity/model_sparsity": 0.7806205733734489, + "compression_loss": 137.07310485839844, + "distillation_loss": 5.932247638702393, + "epoch": 1.47, + "learning_rate": 3.949402246105543e-05, + "loss": 143.3072, + "step": 1740, + "task_loss": 2.5304110050201416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2772391971858275, + "compression/movement_sparsity/importance_threshold": -0.0006777746035444027, + "compression/movement_sparsity/linear_layer_sparsity": 0.8092920531050914, + "compression/movement_sparsity/model_sparsity": 0.7814903814073685, + "compression_loss": 137.1866912841797, + "distillation_loss": 7.508889198303223, + "epoch": 1.47, + "learning_rate": 3.94879845429296e-05, + "loss": 144.3809, + "step": 1741, + "task_loss": 4.748712539672852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2783033393451833, + "compression/movement_sparsity/importance_threshold": -0.0006745368322621098, + "compression/movement_sparsity/linear_layer_sparsity": 0.8098400640013004, + "compression/movement_sparsity/model_sparsity": 0.782019566443414, + "compression_loss": 137.29981994628906, + "distillation_loss": 5.782407283782959, + "epoch": 1.47, + "learning_rate": 3.948194662480377e-05, + "loss": 143.3534, + "step": 1742, + "task_loss": 4.705403804779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2793640871145688, + "compression/movement_sparsity/importance_threshold": -0.0006713093887901283, + "compression/movement_sparsity/linear_layer_sparsity": 0.8103145862523695, + "compression/movement_sparsity/model_sparsity": 0.7824777873953578, + "compression_loss": 137.41268920898438, + "distillation_loss": 9.290245056152344, + "epoch": 1.47, + "learning_rate": 3.9475908706677936e-05, + "loss": 144.3383, + "step": 1743, + "task_loss": 4.8011155128479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2804214459163328, + "compression/movement_sparsity/importance_threshold": -0.0006680922566303579, + "compression/movement_sparsity/linear_layer_sparsity": 0.8111636823813841, + "compression/movement_sparsity/model_sparsity": 0.7832977144602155, + "compression_loss": 137.52517700195312, + "distillation_loss": 8.803297996520996, + "epoch": 1.47, + "learning_rate": 3.946987078855211e-05, + "loss": 144.1548, + "step": 1744, + "task_loss": 4.50725793838501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2814754211728228, + "compression/movement_sparsity/importance_threshold": -0.0006648854192847008, + "compression/movement_sparsity/linear_layer_sparsity": 0.8119442741673306, + "compression/movement_sparsity/model_sparsity": 0.7840514905169335, + "compression_loss": 137.63722229003906, + "distillation_loss": 6.513503074645996, + "epoch": 1.47, + "learning_rate": 3.9463832870426284e-05, + "loss": 143.5152, + "step": 1745, + "task_loss": 2.372196674346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2825260183063873, + "compression/movement_sparsity/importance_threshold": -0.0006616888602550552, + "compression/movement_sparsity/linear_layer_sparsity": 0.8127045471716255, + "compression/movement_sparsity/model_sparsity": 0.7847856458046576, + "compression_loss": 137.74900817871094, + "distillation_loss": 7.200915336608887, + "epoch": 1.48, + "learning_rate": 3.945779495230045e-05, + "loss": 144.3316, + "step": 1746, + "task_loss": 3.886157751083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2835732427393747, + "compression/movement_sparsity/importance_threshold": -0.0006585025630433234, + "compression/movement_sparsity/linear_layer_sparsity": 0.8133733379618182, + "compression/movement_sparsity/model_sparsity": 0.785431461573766, + "compression_loss": 137.8603973388672, + "distillation_loss": 6.107273101806641, + "epoch": 1.48, + "learning_rate": 3.945175703417462e-05, + "loss": 143.9328, + "step": 1747, + "task_loss": 2.3961503505706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2846170998941326, + "compression/movement_sparsity/importance_threshold": -0.0006553265111514054, + "compression/movement_sparsity/linear_layer_sparsity": 0.8140284755800679, + "compression/movement_sparsity/model_sparsity": 0.7860640931993896, + "compression_loss": 137.9713592529297, + "distillation_loss": 7.003299713134766, + "epoch": 1.48, + "learning_rate": 3.944571911604879e-05, + "loss": 143.8237, + "step": 1748, + "task_loss": 3.2412619590759277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2856575951930098, + "compression/movement_sparsity/importance_threshold": -0.0006521606880812011, + "compression/movement_sparsity/linear_layer_sparsity": 0.8146782830953841, + "compression/movement_sparsity/model_sparsity": 0.7866915778275132, + "compression_loss": 138.08189392089844, + "distillation_loss": 5.127871990203857, + "epoch": 1.48, + "learning_rate": 3.943968119792296e-05, + "loss": 143.6361, + "step": 1749, + "task_loss": 2.6416983604431152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.286694734058354, + "compression/movement_sparsity/importance_threshold": -0.000649005077334612, + "compression/movement_sparsity/linear_layer_sparsity": 0.8154368509437072, + "compression/movement_sparsity/model_sparsity": 0.7874240865366187, + "compression_loss": 138.19223022460938, + "distillation_loss": 5.6010870933532715, + "epoch": 1.48, + "learning_rate": 3.9433643279797126e-05, + "loss": 144.204, + "step": 1750, + "task_loss": 2.7898385524749756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2877285219125139, + "compression/movement_sparsity/importance_threshold": -0.0006458596624135384, + "compression/movement_sparsity/linear_layer_sparsity": 0.8159866743133969, + "compression/movement_sparsity/model_sparsity": 0.787955021782105, + "compression_loss": 138.3020782470703, + "distillation_loss": 7.651647567749023, + "epoch": 1.48, + "learning_rate": 3.94276053616713e-05, + "loss": 143.6304, + "step": 1751, + "task_loss": 3.8118278980255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2887589641778374, + "compression/movement_sparsity/importance_threshold": -0.0006427244268198788, + "compression/movement_sparsity/linear_layer_sparsity": 0.8167127011082417, + "compression/movement_sparsity/model_sparsity": 0.7886561073230272, + "compression_loss": 138.41151428222656, + "distillation_loss": 5.834653854370117, + "epoch": 1.48, + "learning_rate": 3.942156744354547e-05, + "loss": 144.2587, + "step": 1752, + "task_loss": 2.2168214321136475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2897860662766727, + "compression/movement_sparsity/importance_threshold": -0.000639599354055536, + "compression/movement_sparsity/linear_layer_sparsity": 0.8176160521999994, + "compression/movement_sparsity/model_sparsity": 0.78952842552575, + "compression_loss": 138.52056884765625, + "distillation_loss": 4.8426713943481445, + "epoch": 1.48, + "learning_rate": 3.9415529525419635e-05, + "loss": 144.8146, + "step": 1753, + "task_loss": 2.2209486961364746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2908098336313678, + "compression/movement_sparsity/importance_threshold": -0.0006364844276224102, + "compression/movement_sparsity/linear_layer_sparsity": 0.8184404772261753, + "compression/movement_sparsity/model_sparsity": 0.7903245290160488, + "compression_loss": 138.62942504882812, + "distillation_loss": 6.882920742034912, + "epoch": 1.48, + "learning_rate": 3.940949160729381e-05, + "loss": 144.5015, + "step": 1754, + "task_loss": 4.240734577178955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2918302716642716, + "compression/movement_sparsity/importance_threshold": -0.0006333796310224, + "compression/movement_sparsity/linear_layer_sparsity": 0.8190444482410995, + "compression/movement_sparsity/model_sparsity": 0.790907751768578, + "compression_loss": 138.73785400390625, + "distillation_loss": 5.063625335693359, + "epoch": 1.48, + "learning_rate": 3.940345368916798e-05, + "loss": 144.5788, + "step": 1755, + "task_loss": 2.6990959644317627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2928473857977316, + "compression/movement_sparsity/importance_threshold": -0.0006302849477574082, + "compression/movement_sparsity/linear_layer_sparsity": 0.8197788934982951, + "compression/movement_sparsity/model_sparsity": 0.7916169665717712, + "compression_loss": 138.84597778320312, + "distillation_loss": 4.88409948348999, + "epoch": 1.48, + "learning_rate": 3.939741577104214e-05, + "loss": 144.4637, + "step": 1756, + "task_loss": 3.1373655796051025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2938611814540961, + "compression/movement_sparsity/importance_threshold": -0.0006272003613293341, + "compression/movement_sparsity/linear_layer_sparsity": 0.8204202110062547, + "compression/movement_sparsity/model_sparsity": 0.792236252850409, + "compression_loss": 138.95367431640625, + "distillation_loss": 7.489863395690918, + "epoch": 1.48, + "learning_rate": 3.939137785291632e-05, + "loss": 145.4002, + "step": 1757, + "task_loss": 3.7274980545043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2948716640557136, + "compression/movement_sparsity/importance_threshold": -0.0006241258552400772, + "compression/movement_sparsity/linear_layer_sparsity": 0.8210846137027574, + "compression/movement_sparsity/model_sparsity": 0.792877831270345, + "compression_loss": 139.06105041503906, + "distillation_loss": 6.363085746765137, + "epoch": 1.49, + "learning_rate": 3.938533993479049e-05, + "loss": 144.8903, + "step": 1758, + "task_loss": 2.6233699321746826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2958788390249318, + "compression/movement_sparsity/importance_threshold": -0.0006210614129915403, + "compression/movement_sparsity/linear_layer_sparsity": 0.8217275886700184, + "compression/movement_sparsity/model_sparsity": 0.7934987180694582, + "compression_loss": 139.16806030273438, + "distillation_loss": 5.925960540771484, + "epoch": 1.49, + "learning_rate": 3.937930201666465e-05, + "loss": 144.6537, + "step": 1759, + "task_loss": 2.42376971244812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2968827117840998, + "compression/movement_sparsity/importance_threshold": -0.0006180070180856218, + "compression/movement_sparsity/linear_layer_sparsity": 0.8223641961317618, + "compression/movement_sparsity/model_sparsity": 0.7941134561064571, + "compression_loss": 139.2747039794922, + "distillation_loss": 6.429130554199219, + "epoch": 1.49, + "learning_rate": 3.9373264098538825e-05, + "loss": 145.0042, + "step": 1760, + "task_loss": 2.613955020904541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2978832877555648, + "compression/movement_sparsity/importance_threshold": -0.000614962654024223, + "compression/movement_sparsity/linear_layer_sparsity": 0.8230937763285622, + "compression/movement_sparsity/model_sparsity": 0.7948179729790461, + "compression_loss": 139.38095092773438, + "distillation_loss": 5.598249435424805, + "epoch": 1.49, + "learning_rate": 3.9367226180413e-05, + "loss": 144.9486, + "step": 1761, + "task_loss": 3.1093711853027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2988805723616752, + "compression/movement_sparsity/importance_threshold": -0.0006119283043092448, + "compression/movement_sparsity/linear_layer_sparsity": 0.824018113955359, + "compression/movement_sparsity/model_sparsity": 0.7957105567647673, + "compression_loss": 139.4868927001953, + "distillation_loss": 5.053100109100342, + "epoch": 1.49, + "learning_rate": 3.9361188262287166e-05, + "loss": 145.444, + "step": 1762, + "task_loss": 2.607212543487549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.2998745710247797, + "compression/movement_sparsity/importance_threshold": -0.0006089039524425867, + "compression/movement_sparsity/linear_layer_sparsity": 0.8246679453190106, + "compression/movement_sparsity/model_sparsity": 0.7963380644219624, + "compression_loss": 139.5924072265625, + "distillation_loss": 7.497715950012207, + "epoch": 1.49, + "learning_rate": 3.9355150344161334e-05, + "loss": 145.3521, + "step": 1763, + "task_loss": 3.1383824348449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.300865289167226, + "compression/movement_sparsity/importance_threshold": -0.0006058895819261506, + "compression/movement_sparsity/linear_layer_sparsity": 0.8254046203955542, + "compression/movement_sparsity/model_sparsity": 0.7970494324433491, + "compression_loss": 139.6975555419922, + "distillation_loss": 8.2505521774292, + "epoch": 1.49, + "learning_rate": 3.934911242603551e-05, + "loss": 146.1034, + "step": 1764, + "task_loss": 3.020756244659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3018527322113624, + "compression/movement_sparsity/importance_threshold": -0.000602885176261835, + "compression/movement_sparsity/linear_layer_sparsity": 0.8261942864730716, + "compression/movement_sparsity/model_sparsity": 0.7978119710618068, + "compression_loss": 139.80259704589844, + "distillation_loss": 3.945004940032959, + "epoch": 1.49, + "learning_rate": 3.9343074507909675e-05, + "loss": 144.9918, + "step": 1765, + "task_loss": 1.8654829263687134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3028369055795375, + "compression/movement_sparsity/importance_threshold": -0.000599890718951542, + "compression/movement_sparsity/linear_layer_sparsity": 0.8267308501683501, + "compression/movement_sparsity/model_sparsity": 0.7983301021434895, + "compression_loss": 139.90704345703125, + "distillation_loss": 6.858841419219971, + "epoch": 1.49, + "learning_rate": 3.933703658978384e-05, + "loss": 146.1691, + "step": 1766, + "task_loss": 2.8752636909484863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.303817814694099, + "compression/movement_sparsity/importance_threshold": -0.0005969061934971709, + "compression/movement_sparsity/linear_layer_sparsity": 0.8274216291236633, + "compression/movement_sparsity/model_sparsity": 0.7989971507166032, + "compression_loss": 140.01119995117188, + "distillation_loss": 5.228715896606445, + "epoch": 1.49, + "learning_rate": 3.9330998671658016e-05, + "loss": 145.1536, + "step": 1767, + "task_loss": 2.606445789337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3047954649773952, + "compression/movement_sparsity/importance_threshold": -0.0005939315834006227, + "compression/movement_sparsity/linear_layer_sparsity": 0.8281291973070078, + "compression/movement_sparsity/model_sparsity": 0.7996804117561154, + "compression_loss": 140.1150360107422, + "distillation_loss": 7.838719367980957, + "epoch": 1.49, + "learning_rate": 3.932496075353218e-05, + "loss": 146.51, + "step": 1768, + "task_loss": 4.325048923492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3057698618517743, + "compression/movement_sparsity/importance_threshold": -0.0005909668721637978, + "compression/movement_sparsity/linear_layer_sparsity": 0.8287999794331956, + "compression/movement_sparsity/model_sparsity": 0.8003281504527014, + "compression_loss": 140.2186279296875, + "distillation_loss": 6.408384323120117, + "epoch": 1.5, + "learning_rate": 3.931892283540635e-05, + "loss": 145.9463, + "step": 1769, + "task_loss": 3.1985602378845215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3067410107395847, + "compression/movement_sparsity/importance_threshold": -0.0005880120432885973, + "compression/movement_sparsity/linear_layer_sparsity": 0.8294655864706295, + "compression/movement_sparsity/model_sparsity": 0.8009708918407528, + "compression_loss": 140.32174682617188, + "distillation_loss": 5.4939045906066895, + "epoch": 1.5, + "learning_rate": 3.9312884917280524e-05, + "loss": 147.2387, + "step": 1770, + "task_loss": 3.274066209793091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3077089170631744, + "compression/movement_sparsity/importance_threshold": -0.0005850670802769204, + "compression/movement_sparsity/linear_layer_sparsity": 0.8301473269068748, + "compression/movement_sparsity/model_sparsity": 0.801629212395734, + "compression_loss": 140.4244842529297, + "distillation_loss": 4.6569342613220215, + "epoch": 1.5, + "learning_rate": 3.93068469991547e-05, + "loss": 145.769, + "step": 1771, + "task_loss": 1.4715452194213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3086735862448915, + "compression/movement_sparsity/importance_threshold": -0.0005821319666306692, + "compression/movement_sparsity/linear_layer_sparsity": 0.8307435949095061, + "compression/movement_sparsity/model_sparsity": 0.80220499675814, + "compression_loss": 140.5270233154297, + "distillation_loss": 5.71185827255249, + "epoch": 1.5, + "learning_rate": 3.930080908102886e-05, + "loss": 145.9759, + "step": 1772, + "task_loss": 3.63094425201416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3096350237070846, + "compression/movement_sparsity/importance_threshold": -0.0005792066858517422, + "compression/movement_sparsity/linear_layer_sparsity": 0.8313881557910626, + "compression/movement_sparsity/model_sparsity": 0.8028274149905139, + "compression_loss": 140.6290740966797, + "distillation_loss": 4.518129348754883, + "epoch": 1.5, + "learning_rate": 3.929477116290303e-05, + "loss": 145.7943, + "step": 1773, + "task_loss": 1.7505050897598267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3105932348721014, + "compression/movement_sparsity/importance_threshold": -0.0005762912214420414, + "compression/movement_sparsity/linear_layer_sparsity": 0.831980166865848, + "compression/movement_sparsity/model_sparsity": 0.8033990886636413, + "compression_loss": 140.73081970214844, + "distillation_loss": 5.6234517097473145, + "epoch": 1.5, + "learning_rate": 3.9288733244777206e-05, + "loss": 147.0208, + "step": 1774, + "task_loss": 4.130677223205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3115482251622907, + "compression/movement_sparsity/importance_threshold": -0.0005733855569034661, + "compression/movement_sparsity/linear_layer_sparsity": 0.8325881682494329, + "compression/movement_sparsity/model_sparsity": 0.8039862033292692, + "compression_loss": 140.8321533203125, + "distillation_loss": 7.025821685791016, + "epoch": 1.5, + "learning_rate": 3.928269532665137e-05, + "loss": 146.7151, + "step": 1775, + "task_loss": 3.4836831092834473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3125, + "compression/movement_sparsity/importance_threshold": -0.0005704896757379174, + "compression/movement_sparsity/linear_layer_sparsity": 0.833281999791661, + "compression/movement_sparsity/model_sparsity": 0.8046561996235461, + "compression_loss": 140.93325805664062, + "distillation_loss": 5.409782409667969, + "epoch": 1.5, + "learning_rate": 3.927665740852554e-05, + "loss": 147.1021, + "step": 1776, + "task_loss": 1.9679040908813477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.313448564807578, + "compression/movement_sparsity/importance_threshold": -0.0005676035614472956, + "compression/movement_sparsity/linear_layer_sparsity": 0.8339593759825514, + "compression/movement_sparsity/model_sparsity": 0.8053103058584266, + "compression_loss": 141.03379821777344, + "distillation_loss": 7.2750244140625, + "epoch": 1.5, + "learning_rate": 3.9270619490399715e-05, + "loss": 147.2537, + "step": 1777, + "task_loss": 3.176138162612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3143939250073724, + "compression/movement_sparsity/importance_threshold": -0.0005647271975335027, + "compression/movement_sparsity/linear_layer_sparsity": 0.834428615827358, + "compression/movement_sparsity/model_sparsity": 0.8057634258710134, + "compression_loss": 141.13417053222656, + "distillation_loss": 6.577167510986328, + "epoch": 1.5, + "learning_rate": 3.926458157227388e-05, + "loss": 146.2798, + "step": 1778, + "task_loss": 3.0580403804779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.315336086021732, + "compression/movement_sparsity/importance_threshold": -0.0005618605674984362, + "compression/movement_sparsity/linear_layer_sparsity": 0.8349578819320433, + "compression/movement_sparsity/model_sparsity": 0.8062745100567899, + "compression_loss": 141.2341766357422, + "distillation_loss": 4.356266975402832, + "epoch": 1.5, + "learning_rate": 3.925854365414805e-05, + "loss": 146.4973, + "step": 1779, + "task_loss": 2.0127203464508057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3162750532730045, + "compression/movement_sparsity/importance_threshold": -0.0005590036548439991, + "compression/movement_sparsity/linear_layer_sparsity": 0.8354120734772933, + "compression/movement_sparsity/model_sparsity": 0.8067130987252039, + "compression_loss": 141.33375549316406, + "distillation_loss": 4.648265838623047, + "epoch": 1.5, + "learning_rate": 3.925250573602222e-05, + "loss": 147.0363, + "step": 1780, + "task_loss": 3.4716920852661133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3172108321835385, + "compression/movement_sparsity/importance_threshold": -0.0005561564430720898, + "compression/movement_sparsity/linear_layer_sparsity": 0.8358745046223798, + "compression/movement_sparsity/model_sparsity": 0.807159643937852, + "compression_loss": 141.43310546875, + "distillation_loss": 7.4663848876953125, + "epoch": 1.51, + "learning_rate": 3.924646781789639e-05, + "loss": 147.5221, + "step": 1781, + "task_loss": 3.9189014434814453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3181434281756819, + "compression/movement_sparsity/importance_threshold": -0.0005533189156846112, + "compression/movement_sparsity/linear_layer_sparsity": 0.8364867271613079, + "compression/movement_sparsity/model_sparsity": 0.8077508347491511, + "compression_loss": 141.53187561035156, + "distillation_loss": 6.658722877502441, + "epoch": 1.51, + "learning_rate": 3.924042989977056e-05, + "loss": 148.423, + "step": 1782, + "task_loss": 3.3712058067321777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.319072846671783, + "compression/movement_sparsity/importance_threshold": -0.0005504910561834617, + "compression/movement_sparsity/linear_layer_sparsity": 0.8370228854348868, + "compression/movement_sparsity/model_sparsity": 0.8082685743366168, + "compression_loss": 141.6305389404297, + "distillation_loss": 7.865726947784424, + "epoch": 1.51, + "learning_rate": 3.923439198164473e-05, + "loss": 148.1572, + "step": 1783, + "task_loss": 3.6238651275634766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3199990930941898, + "compression/movement_sparsity/importance_threshold": -0.0005476728480705425, + "compression/movement_sparsity/linear_layer_sparsity": 0.8376943114661269, + "compression/movement_sparsity/model_sparsity": 0.8089169348181358, + "compression_loss": 141.7288055419922, + "distillation_loss": 6.46535587310791, + "epoch": 1.51, + "learning_rate": 3.92283540635189e-05, + "loss": 147.4594, + "step": 1784, + "task_loss": 3.276215076446533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.320922172865251, + "compression/movement_sparsity/importance_threshold": -0.0005448642748477539, + "compression/movement_sparsity/linear_layer_sparsity": 0.838362434502932, + "compression/movement_sparsity/model_sparsity": 0.8095621057732397, + "compression_loss": 141.8267822265625, + "distillation_loss": 6.641994953155518, + "epoch": 1.51, + "learning_rate": 3.9222316145393066e-05, + "loss": 147.7042, + "step": 1785, + "task_loss": 2.716071844100952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3218420914073143, + "compression/movement_sparsity/importance_threshold": -0.0005420653200169968, + "compression/movement_sparsity/linear_layer_sparsity": 0.8388262250031289, + "compression/movement_sparsity/model_sparsity": 0.8100099636429684, + "compression_loss": 141.9244384765625, + "distillation_loss": 6.2577104568481445, + "epoch": 1.51, + "learning_rate": 3.921627822726724e-05, + "loss": 147.8868, + "step": 1786, + "task_loss": 4.215167999267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.322758854142728, + "compression/movement_sparsity/importance_threshold": -0.0005392759670801708, + "compression/movement_sparsity/linear_layer_sparsity": 0.8393582932872088, + "compression/movement_sparsity/model_sparsity": 0.8105237537446566, + "compression_loss": 142.0216827392578, + "distillation_loss": 5.036468505859375, + "epoch": 1.51, + "learning_rate": 3.9210240309141414e-05, + "loss": 148.3714, + "step": 1787, + "task_loss": 2.350123405456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3236724664938404, + "compression/movement_sparsity/importance_threshold": -0.0005364961995391777, + "compression/movement_sparsity/linear_layer_sparsity": 0.8397848088393759, + "compression/movement_sparsity/model_sparsity": 0.8109356171754916, + "compression_loss": 142.11856079101562, + "distillation_loss": 4.747697830200195, + "epoch": 1.51, + "learning_rate": 3.920420239101558e-05, + "loss": 147.5398, + "step": 1788, + "task_loss": 3.692176342010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3245829338829997, + "compression/movement_sparsity/importance_threshold": -0.0005337260008959161, + "compression/movement_sparsity/linear_layer_sparsity": 0.8403420728896703, + "compression/movement_sparsity/model_sparsity": 0.8114737374913136, + "compression_loss": 142.21507263183594, + "distillation_loss": 6.544585704803467, + "epoch": 1.51, + "learning_rate": 3.919816447288975e-05, + "loss": 148.1249, + "step": 1789, + "task_loss": 3.345541477203369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3254902617325541, + "compression/movement_sparsity/importance_threshold": -0.000530965354652288, + "compression/movement_sparsity/linear_layer_sparsity": 0.8408676901990592, + "compression/movement_sparsity/model_sparsity": 0.8119812982291369, + "compression_loss": 142.31141662597656, + "distillation_loss": 7.286420822143555, + "epoch": 1.51, + "learning_rate": 3.919212655476392e-05, + "loss": 148.3581, + "step": 1790, + "task_loss": 2.778806447982788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3263944554648517, + "compression/movement_sparsity/importance_threshold": -0.0005282142443101935, + "compression/movement_sparsity/linear_layer_sparsity": 0.8413476975672408, + "compression/movement_sparsity/model_sparsity": 0.8124448158675461, + "compression_loss": 142.40724182128906, + "distillation_loss": 4.967457294464111, + "epoch": 1.51, + "learning_rate": 3.918608863663809e-05, + "loss": 147.358, + "step": 1791, + "task_loss": 2.412517786026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3272955205022408, + "compression/movement_sparsity/importance_threshold": -0.0005254726533715339, + "compression/movement_sparsity/linear_layer_sparsity": 0.841845996608576, + "compression/movement_sparsity/model_sparsity": 0.8129259968038642, + "compression_loss": 142.5028839111328, + "distillation_loss": 4.927112579345703, + "epoch": 1.51, + "learning_rate": 3.9180050718512256e-05, + "loss": 147.8677, + "step": 1792, + "task_loss": 2.2537665367126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3281934622670695, + "compression/movement_sparsity/importance_threshold": -0.0005227405653382075, + "compression/movement_sparsity/linear_layer_sparsity": 0.8424267989657835, + "compression/movement_sparsity/model_sparsity": 0.8134868468133447, + "compression_loss": 142.5980682373047, + "distillation_loss": 5.980989456176758, + "epoch": 1.52, + "learning_rate": 3.917401280038643e-05, + "loss": 149.0166, + "step": 1793, + "task_loss": 2.145894765853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.329088286181686, + "compression/movement_sparsity/importance_threshold": -0.0005200179637121155, + "compression/movement_sparsity/linear_layer_sparsity": 0.8431107096005385, + "compression/movement_sparsity/model_sparsity": 0.8141472630138407, + "compression_loss": 142.69297790527344, + "distillation_loss": 4.006131172180176, + "epoch": 1.52, + "learning_rate": 3.91679748822606e-05, + "loss": 147.8136, + "step": 1794, + "task_loss": 1.4242582321166992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3299799976684386, + "compression/movement_sparsity/importance_threshold": -0.00051730483199516, + "compression/movement_sparsity/linear_layer_sparsity": 0.843798626755619, + "compression/movement_sparsity/model_sparsity": 0.8148115480983634, + "compression_loss": 142.78756713867188, + "distillation_loss": 6.70750617980957, + "epoch": 1.52, + "learning_rate": 3.9161936964134765e-05, + "loss": 149.304, + "step": 1795, + "task_loss": 3.9344842433929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3308686021496752, + "compression/movement_sparsity/importance_threshold": -0.0005146011536892401, + "compression/movement_sparsity/linear_layer_sparsity": 0.8443933446164578, + "compression/movement_sparsity/model_sparsity": 0.8153858355711162, + "compression_loss": 142.88172912597656, + "distillation_loss": 5.885377883911133, + "epoch": 1.52, + "learning_rate": 3.915589904600894e-05, + "loss": 148.9896, + "step": 1796, + "task_loss": 3.4574124813079834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3317541050477444, + "compression/movement_sparsity/importance_threshold": -0.0005119069122962562, + "compression/movement_sparsity/linear_layer_sparsity": 0.8451308066880653, + "compression/movement_sparsity/model_sparsity": 0.8160979635518654, + "compression_loss": 142.9755401611328, + "distillation_loss": 5.861017227172852, + "epoch": 1.52, + "learning_rate": 3.9149861127883106e-05, + "loss": 148.3139, + "step": 1797, + "task_loss": 2.513068914413452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3326365117849943, + "compression/movement_sparsity/importance_threshold": -0.0005092220913181086, + "compression/movement_sparsity/linear_layer_sparsity": 0.8458398176956937, + "compression/movement_sparsity/model_sparsity": 0.8167826178502088, + "compression_loss": 143.06912231445312, + "distillation_loss": 4.897464752197266, + "epoch": 1.52, + "learning_rate": 3.914382320975728e-05, + "loss": 148.4563, + "step": 1798, + "task_loss": 3.0545711517333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3335158277837729, + "compression/movement_sparsity/importance_threshold": -0.0005065466742566992, + "compression/movement_sparsity/linear_layer_sparsity": 0.8463216613856913, + "compression/movement_sparsity/model_sparsity": 0.8172479087271304, + "compression_loss": 143.16224670410156, + "distillation_loss": 5.793306827545166, + "epoch": 1.52, + "learning_rate": 3.913778529163145e-05, + "loss": 148.3859, + "step": 1799, + "task_loss": 3.607916831970215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3343920584664284, + "compression/movement_sparsity/importance_threshold": -0.0005038806446139264, + "compression/movement_sparsity/linear_layer_sparsity": 0.8469339316212899, + "compression/movement_sparsity/model_sparsity": 0.8178391455965727, + "compression_loss": 143.25506591796875, + "distillation_loss": 8.277213096618652, + "epoch": 1.52, + "learning_rate": 3.9131747373505614e-05, + "loss": 149.3106, + "step": 1800, + "task_loss": 3.839252233505249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3352652092553092, + "compression/movement_sparsity/importance_threshold": -0.0005012239858916924, + "compression/movement_sparsity/linear_layer_sparsity": 0.8476092091586765, + "compression/movement_sparsity/model_sparsity": 0.8184912252731533, + "compression_loss": 143.3475799560547, + "distillation_loss": 7.331027984619141, + "epoch": 1.52, + "learning_rate": 3.912570945537979e-05, + "loss": 149.7581, + "step": 1801, + "task_loss": 3.7918877601623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3361352855727633, + "compression/movement_sparsity/importance_threshold": -0.0004985766815918972, + "compression/movement_sparsity/linear_layer_sparsity": 0.8482062641563718, + "compression/movement_sparsity/model_sparsity": 0.8190677695949218, + "compression_loss": 143.43975830078125, + "distillation_loss": 6.094710826873779, + "epoch": 1.52, + "learning_rate": 3.9119671537253955e-05, + "loss": 149.996, + "step": 1802, + "task_loss": 3.4455535411834717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3370022928411394, + "compression/movement_sparsity/importance_threshold": -0.0004959387152164403, + "compression/movement_sparsity/linear_layer_sparsity": 0.8489092296110089, + "compression/movement_sparsity/model_sparsity": 0.8197465860236173, + "compression_loss": 143.5316162109375, + "distillation_loss": 4.777237892150879, + "epoch": 1.52, + "learning_rate": 3.911363361912813e-05, + "loss": 150.0481, + "step": 1803, + "task_loss": 3.3300676345825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.337866236482785, + "compression/movement_sparsity/importance_threshold": -0.0004933100702672229, + "compression/movement_sparsity/linear_layer_sparsity": 0.8495149773269107, + "compression/movement_sparsity/model_sparsity": 0.82033152444198, + "compression_loss": 143.6231689453125, + "distillation_loss": 7.864965438842773, + "epoch": 1.52, + "learning_rate": 3.9107595701002296e-05, + "loss": 150.5879, + "step": 1804, + "task_loss": 3.260775089263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3387271219200483, + "compression/movement_sparsity/importance_threshold": -0.0004906907302461459, + "compression/movement_sparsity/linear_layer_sparsity": 0.8502480513048283, + "compression/movement_sparsity/model_sparsity": 0.8210394150735567, + "compression_loss": 143.71438598632812, + "distillation_loss": 7.671809673309326, + "epoch": 1.53, + "learning_rate": 3.9101557782876464e-05, + "loss": 150.1447, + "step": 1805, + "task_loss": 4.524222373962402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.339584954575278, + "compression/movement_sparsity/importance_threshold": -0.0004880806786551088, + "compression/movement_sparsity/linear_layer_sparsity": 0.8507274505404605, + "compression/movement_sparsity/model_sparsity": 0.8215023454706405, + "compression_loss": 143.80523681640625, + "distillation_loss": 6.320964813232422, + "epoch": 1.53, + "learning_rate": 3.909551986475064e-05, + "loss": 149.961, + "step": 1806, + "task_loss": 3.266688346862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3404397398708223, + "compression/movement_sparsity/importance_threshold": -0.00048547989899601265, + "compression/movement_sparsity/linear_layer_sparsity": 0.8511533937325811, + "compression/movement_sparsity/model_sparsity": 0.8219136562037572, + "compression_loss": 143.89590454101562, + "distillation_loss": 7.13585090637207, + "epoch": 1.53, + "learning_rate": 3.9089481946624805e-05, + "loss": 150.1457, + "step": 1807, + "task_loss": 4.044832229614258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3412914832290288, + "compression/movement_sparsity/importance_threshold": -0.0004828883747707586, + "compression/movement_sparsity/linear_layer_sparsity": 0.8515032249626822, + "compression/movement_sparsity/model_sparsity": 0.8222514696548975, + "compression_loss": 143.98617553710938, + "distillation_loss": 7.288115501403809, + "epoch": 1.53, + "learning_rate": 3.908344402849898e-05, + "loss": 150.3703, + "step": 1808, + "task_loss": 3.8762247562408447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3421401900722463, + "compression/movement_sparsity/importance_threshold": -0.0004803060894812451, + "compression/movement_sparsity/linear_layer_sparsity": 0.8520252888701155, + "compression/movement_sparsity/model_sparsity": 0.8227555990610541, + "compression_loss": 144.07611083984375, + "distillation_loss": 6.305458068847656, + "epoch": 1.53, + "learning_rate": 3.9077406110373146e-05, + "loss": 150.6307, + "step": 1809, + "task_loss": 3.966034412384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3429858658228226, + "compression/movement_sparsity/importance_threshold": -0.00047773302662937507, + "compression/movement_sparsity/linear_layer_sparsity": 0.8526517369410331, + "compression/movement_sparsity/model_sparsity": 0.8233605267135561, + "compression_loss": 144.16567993164062, + "distillation_loss": 7.580005645751953, + "epoch": 1.53, + "learning_rate": 3.907136819224731e-05, + "loss": 150.6131, + "step": 1810, + "task_loss": 3.3612935543060303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3438285159031063, + "compression/movement_sparsity/importance_threshold": -0.00047516916971704695, + "compression/movement_sparsity/linear_layer_sparsity": 0.8531029951410448, + "compression/movement_sparsity/model_sparsity": 0.8237962828061647, + "compression_loss": 144.25503540039062, + "distillation_loss": 6.881827354431152, + "epoch": 1.53, + "learning_rate": 3.906533027412149e-05, + "loss": 151.228, + "step": 1811, + "task_loss": 3.7318949699401855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.344668145735445, + "compression/movement_sparsity/importance_threshold": -0.00047261450224616187, + "compression/movement_sparsity/linear_layer_sparsity": 0.8536398688646819, + "compression/movement_sparsity/model_sparsity": 0.8243147132657781, + "compression_loss": 144.34396362304688, + "distillation_loss": 6.9348602294921875, + "epoch": 1.53, + "learning_rate": 3.9059292355995654e-05, + "loss": 150.595, + "step": 1812, + "task_loss": 3.9608113765716553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3455047607421875, + "compression/movement_sparsity/importance_threshold": -0.0004700690077186209, + "compression/movement_sparsity/linear_layer_sparsity": 0.8543050704804162, + "compression/movement_sparsity/model_sparsity": 0.8249570631596124, + "compression_loss": 144.43276977539062, + "distillation_loss": 5.598838806152344, + "epoch": 1.53, + "learning_rate": 3.905325443786982e-05, + "loss": 149.9999, + "step": 1813, + "task_loss": 1.7702279090881348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3463383663456816, + "compression/movement_sparsity/importance_threshold": -0.0004675326696363235, + "compression/movement_sparsity/linear_layer_sparsity": 0.8547549693253172, + "compression/movement_sparsity/model_sparsity": 0.8253915065951404, + "compression_loss": 144.5210723876953, + "distillation_loss": 6.831597805023193, + "epoch": 1.53, + "learning_rate": 3.9047216519743995e-05, + "loss": 150.4648, + "step": 1814, + "task_loss": 3.3680505752563477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3471689679682757, + "compression/movement_sparsity/importance_threshold": -0.00046500547150117155, + "compression/movement_sparsity/linear_layer_sparsity": 0.8554045979781191, + "compression/movement_sparsity/model_sparsity": 0.826018818505227, + "compression_loss": 144.60910034179688, + "distillation_loss": 6.159719467163086, + "epoch": 1.53, + "learning_rate": 3.904117860161816e-05, + "loss": 151.1119, + "step": 1815, + "task_loss": 3.7059710025787354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.347996571032318, + "compression/movement_sparsity/importance_threshold": -0.0004624873968150636, + "compression/movement_sparsity/linear_layer_sparsity": 0.8558579906041375, + "compression/movement_sparsity/model_sparsity": 0.8264566356997428, + "compression_loss": 144.6968536376953, + "distillation_loss": 6.644157409667969, + "epoch": 1.53, + "learning_rate": 3.903514068349233e-05, + "loss": 151.1702, + "step": 1816, + "task_loss": 3.3798046112060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3488211809601565, + "compression/movement_sparsity/importance_threshold": -0.00045997842907990246, + "compression/movement_sparsity/linear_layer_sparsity": 0.8563611666300356, + "compression/movement_sparsity/model_sparsity": 0.8269425260812008, + "compression_loss": 144.78424072265625, + "distillation_loss": 6.457886695861816, + "epoch": 1.54, + "learning_rate": 3.9029102765366504e-05, + "loss": 151.1055, + "step": 1817, + "task_loss": 3.5257694721221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3496428031741394, + "compression/movement_sparsity/importance_threshold": -0.0004574785517975858, + "compression/movement_sparsity/linear_layer_sparsity": 0.8567924041525866, + "compression/movement_sparsity/model_sparsity": 0.8273589492682104, + "compression_loss": 144.8712615966797, + "distillation_loss": 8.108641624450684, + "epoch": 1.54, + "learning_rate": 3.902306484724068e-05, + "loss": 151.6141, + "step": 1818, + "task_loss": 3.3345062732696533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3504614430966153, + "compression/movement_sparsity/importance_threshold": -0.00045498774847001645, + "compression/movement_sparsity/linear_layer_sparsity": 0.8573195119829299, + "compression/movement_sparsity/model_sparsity": 0.827867949323008, + "compression_loss": 144.9580841064453, + "distillation_loss": 6.7251667976379395, + "epoch": 1.54, + "learning_rate": 3.9017026929114845e-05, + "loss": 151.8117, + "step": 1819, + "task_loss": 2.868752956390381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3512771061499318, + "compression/movement_sparsity/importance_threshold": -0.00045250600259909377, + "compression/movement_sparsity/linear_layer_sparsity": 0.8578487065426096, + "compression/movement_sparsity/model_sparsity": 0.8283789644215698, + "compression_loss": 145.04441833496094, + "distillation_loss": 6.845142364501953, + "epoch": 1.54, + "learning_rate": 3.901098901098901e-05, + "loss": 151.5003, + "step": 1820, + "task_loss": 3.3127574920654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3520897977564377, + "compression/movement_sparsity/importance_threshold": -0.000450033297686718, + "compression/movement_sparsity/linear_layer_sparsity": 0.8583046270921667, + "compression/movement_sparsity/model_sparsity": 0.828819222697674, + "compression_loss": 145.13047790527344, + "distillation_loss": 7.886384963989258, + "epoch": 1.54, + "learning_rate": 3.9004951092863186e-05, + "loss": 151.9832, + "step": 1821, + "task_loss": 3.1895766258239746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3528995233384806, + "compression/movement_sparsity/importance_threshold": -0.00044756961723479115, + "compression/movement_sparsity/linear_layer_sparsity": 0.858686248153185, + "compression/movement_sparsity/model_sparsity": 0.8291877339012426, + "compression_loss": 145.2162322998047, + "distillation_loss": 5.889693260192871, + "epoch": 1.54, + "learning_rate": 3.899891317473735e-05, + "loss": 151.2134, + "step": 1822, + "task_loss": 3.0155718326568604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3537062883184092, + "compression/movement_sparsity/importance_threshold": -0.0004451149447452108, + "compression/movement_sparsity/linear_layer_sparsity": 0.8593464058460092, + "compression/movement_sparsity/model_sparsity": 0.8298252131464356, + "compression_loss": 145.30152893066406, + "distillation_loss": 7.186237335205078, + "epoch": 1.54, + "learning_rate": 3.899287525661152e-05, + "loss": 152.0894, + "step": 1823, + "task_loss": 2.638792037963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3545100981185716, + "compression/movement_sparsity/importance_threshold": -0.00044266926371987984, + "compression/movement_sparsity/linear_layer_sparsity": 0.8597539381233001, + "compression/movement_sparsity/model_sparsity": 0.8302187454362857, + "compression_loss": 145.3865966796875, + "distillation_loss": 7.422829627990723, + "epoch": 1.54, + "learning_rate": 3.8986837338485694e-05, + "loss": 152.3152, + "step": 1824, + "task_loss": 3.345848321914673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3553109581613154, + "compression/movement_sparsity/importance_threshold": -0.00044023255766069934, + "compression/movement_sparsity/linear_layer_sparsity": 0.8601871789060139, + "compression/movement_sparsity/model_sparsity": 0.8306371030653088, + "compression_loss": 145.4713592529297, + "distillation_loss": 5.802145957946777, + "epoch": 1.54, + "learning_rate": 3.898079942035986e-05, + "loss": 151.5976, + "step": 1825, + "task_loss": 2.584536075592041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.35610887386899, + "compression/movement_sparsity/importance_threshold": -0.00043780481006956695, + "compression/movement_sparsity/linear_layer_sparsity": 0.8606166516517548, + "compression/movement_sparsity/model_sparsity": 0.8310518221010207, + "compression_loss": 145.55572509765625, + "distillation_loss": 6.697346210479736, + "epoch": 1.54, + "learning_rate": 3.897476150223403e-05, + "loss": 151.6336, + "step": 1826, + "task_loss": 3.2098422050476074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3569038506639421, + "compression/movement_sparsity/importance_threshold": -0.0004353860044483855, + "compression/movement_sparsity/linear_layer_sparsity": 0.8610834470421959, + "compression/movement_sparsity/model_sparsity": 0.8315025816337697, + "compression_loss": 145.6398162841797, + "distillation_loss": 6.716588973999023, + "epoch": 1.54, + "learning_rate": 3.89687235841082e-05, + "loss": 152.0652, + "step": 1827, + "task_loss": 2.6326754093170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.357695893968521, + "compression/movement_sparsity/importance_threshold": -0.00043297612429905354, + "compression/movement_sparsity/linear_layer_sparsity": 0.8616363826196385, + "compression/movement_sparsity/model_sparsity": 0.8320365221730983, + "compression_loss": 145.7235565185547, + "distillation_loss": 7.821836948394775, + "epoch": 1.54, + "learning_rate": 3.8962685665982377e-05, + "loss": 152.6999, + "step": 1828, + "task_loss": 3.8367185592651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3584850092050746, + "compression/movement_sparsity/importance_threshold": -0.00043057515312347385, + "compression/movement_sparsity/linear_layer_sparsity": 0.8621652671509595, + "compression/movement_sparsity/model_sparsity": 0.8325472378937293, + "compression_loss": 145.80709838867188, + "distillation_loss": 6.658909320831299, + "epoch": 1.55, + "learning_rate": 3.895664774785654e-05, + "loss": 152.3078, + "step": 1829, + "task_loss": 2.985086679458618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3592712017959512, + "compression/movement_sparsity/importance_threshold": -0.00042818307442354497, + "compression/movement_sparsity/linear_layer_sparsity": 0.8625916396131151, + "compression/movement_sparsity/model_sparsity": 0.8329589631501347, + "compression_loss": 145.89027404785156, + "distillation_loss": 5.07117223739624, + "epoch": 1.55, + "learning_rate": 3.895060982973071e-05, + "loss": 152.3992, + "step": 1830, + "task_loss": 3.29555606842041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3600544771634988, + "compression/movement_sparsity/importance_threshold": -0.000425799871701168, + "compression/movement_sparsity/linear_layer_sparsity": 0.8630750095965701, + "compression/movement_sparsity/model_sparsity": 0.833425727887638, + "compression_loss": 145.9732208251953, + "distillation_loss": 5.778958320617676, + "epoch": 1.55, + "learning_rate": 3.8944571911604885e-05, + "loss": 152.3016, + "step": 1831, + "task_loss": 3.1990954875946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3608348407300652, + "compression/movement_sparsity/importance_threshold": -0.00042342552845824315, + "compression/movement_sparsity/linear_layer_sparsity": 0.8634221936614559, + "compression/movement_sparsity/model_sparsity": 0.8337609851118318, + "compression_loss": 146.05580139160156, + "distillation_loss": 7.808193683624268, + "epoch": 1.55, + "learning_rate": 3.8938533993479045e-05, + "loss": 152.9909, + "step": 1832, + "task_loss": 4.440708160400391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3616122979179996, + "compression/movement_sparsity/importance_threshold": -0.00042106002819667157, + "compression/movement_sparsity/linear_layer_sparsity": 0.8637997128088073, + "compression/movement_sparsity/model_sparsity": 0.834125535315087, + "compression_loss": 146.13807678222656, + "distillation_loss": 5.318569183349609, + "epoch": 1.55, + "learning_rate": 3.893249607535322e-05, + "loss": 152.3235, + "step": 1833, + "task_loss": 2.386603593826294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3623868541496493, + "compression/movement_sparsity/importance_threshold": -0.0004187033544183535, + "compression/movement_sparsity/linear_layer_sparsity": 0.8642485742511241, + "compression/movement_sparsity/model_sparsity": 0.8345589769860009, + "compression_loss": 146.22006225585938, + "distillation_loss": 6.53436279296875, + "epoch": 1.55, + "learning_rate": 3.892645815722739e-05, + "loss": 152.1619, + "step": 1834, + "task_loss": 3.377082586288452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3631585148473628, + "compression/movement_sparsity/importance_threshold": -0.0004163554906251891, + "compression/movement_sparsity/linear_layer_sparsity": 0.864793115214551, + "compression/movement_sparsity/model_sparsity": 0.8350848112921302, + "compression_loss": 146.3018035888672, + "distillation_loss": 6.998291969299316, + "epoch": 1.55, + "learning_rate": 3.892042023910156e-05, + "loss": 152.5884, + "step": 1835, + "task_loss": 3.403266191482544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3639272854334887, + "compression/movement_sparsity/importance_threshold": -0.0004140164203190787, + "compression/movement_sparsity/linear_layer_sparsity": 0.865275733975445, + "compression/movement_sparsity/model_sparsity": 0.8355508506138785, + "compression_loss": 146.38316345214844, + "distillation_loss": 7.559309959411621, + "epoch": 1.55, + "learning_rate": 3.891438232097573e-05, + "loss": 153.0688, + "step": 1836, + "task_loss": 3.664708137512207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3646931713303745, + "compression/movement_sparsity/importance_threshold": -0.0004116861270019234, + "compression/movement_sparsity/linear_layer_sparsity": 0.8656975991022342, + "compression/movement_sparsity/model_sparsity": 0.8359582233757535, + "compression_loss": 146.46429443359375, + "distillation_loss": 7.0397539138793945, + "epoch": 1.55, + "learning_rate": 3.89083444028499e-05, + "loss": 152.5111, + "step": 1837, + "task_loss": 2.7290971279144287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3654561779603689, + "compression/movement_sparsity/importance_threshold": -0.0004093645941756225, + "compression/movement_sparsity/linear_layer_sparsity": 0.8660820223426468, + "compression/movement_sparsity/model_sparsity": 0.8363294404952337, + "compression_loss": 146.54501342773438, + "distillation_loss": 7.92360782623291, + "epoch": 1.55, + "learning_rate": 3.890230648472407e-05, + "loss": 153.3794, + "step": 1838, + "task_loss": 4.660275459289551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3662163107458198, + "compression/movement_sparsity/importance_threshold": -0.00040705180534207804, + "compression/movement_sparsity/linear_layer_sparsity": 0.8664540206003828, + "compression/movement_sparsity/model_sparsity": 0.836688659468416, + "compression_loss": 146.62538146972656, + "distillation_loss": 6.339056968688965, + "epoch": 1.55, + "learning_rate": 3.8896268566598236e-05, + "loss": 153.2743, + "step": 1839, + "task_loss": 2.3425607681274414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3669735751090755, + "compression/movement_sparsity/importance_threshold": -0.00040474774400318936, + "compression/movement_sparsity/linear_layer_sparsity": 0.8669722449258374, + "compression/movement_sparsity/model_sparsity": 0.8371890811940468, + "compression_loss": 146.7054443359375, + "distillation_loss": 5.48922061920166, + "epoch": 1.56, + "learning_rate": 3.889023064847241e-05, + "loss": 151.9742, + "step": 1840, + "task_loss": 2.817523241043091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3677279764724841, + "compression/movement_sparsity/importance_threshold": -0.0004024523936608567, + "compression/movement_sparsity/linear_layer_sparsity": 0.8674021946382837, + "compression/movement_sparsity/model_sparsity": 0.8376042608111905, + "compression_loss": 146.7852783203125, + "distillation_loss": 7.308719635009766, + "epoch": 1.56, + "learning_rate": 3.888419273034658e-05, + "loss": 153.5786, + "step": 1841, + "task_loss": 3.795900344848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3684795202583941, + "compression/movement_sparsity/importance_threshold": -0.00040016573781698206, + "compression/movement_sparsity/linear_layer_sparsity": 0.8678654962476076, + "compression/movement_sparsity/model_sparsity": 0.8380516465849516, + "compression_loss": 146.86483764648438, + "distillation_loss": 7.258925914764404, + "epoch": 1.56, + "learning_rate": 3.8878154812220744e-05, + "loss": 153.6335, + "step": 1842, + "task_loss": 3.5141148567199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3692282118891534, + "compression/movement_sparsity/importance_threshold": -0.0003978877599734639, + "compression/movement_sparsity/linear_layer_sparsity": 0.8683738592864273, + "compression/movement_sparsity/model_sparsity": 0.8385425457894803, + "compression_loss": 146.94386291503906, + "distillation_loss": 8.211523056030273, + "epoch": 1.56, + "learning_rate": 3.887211689409492e-05, + "loss": 153.0897, + "step": 1843, + "task_loss": 3.171069383621216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3699740567871104, + "compression/movement_sparsity/importance_threshold": -0.00039561844363220425, + "compression/movement_sparsity/linear_layer_sparsity": 0.8688186068909097, + "compression/movement_sparsity/model_sparsity": 0.8389720149455451, + "compression_loss": 147.02268981933594, + "distillation_loss": 6.05572509765625, + "epoch": 1.56, + "learning_rate": 3.886607897596909e-05, + "loss": 153.6951, + "step": 1844, + "task_loss": 2.3466174602508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3707170603746128, + "compression/movement_sparsity/importance_threshold": -0.0003933577722951033, + "compression/movement_sparsity/linear_layer_sparsity": 0.8692269142390969, + "compression/movement_sparsity/model_sparsity": 0.8393662956802217, + "compression_loss": 147.1012420654297, + "distillation_loss": 5.430069923400879, + "epoch": 1.56, + "learning_rate": 3.886004105784325e-05, + "loss": 152.8183, + "step": 1845, + "task_loss": 3.2519054412841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3714572280740096, + "compression/movement_sparsity/importance_threshold": -0.00039110572946406044, + "compression/movement_sparsity/linear_layer_sparsity": 0.86960018838277, + "compression/movement_sparsity/model_sparsity": 0.839726746708734, + "compression_loss": 147.17938232421875, + "distillation_loss": 6.71021032333374, + "epoch": 1.56, + "learning_rate": 3.8854003139717427e-05, + "loss": 154.1356, + "step": 1846, + "task_loss": 2.3991525173187256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3721945653076486, + "compression/movement_sparsity/importance_threshold": -0.00038886229864097677, + "compression/movement_sparsity/linear_layer_sparsity": 0.870217001726238, + "compression/movement_sparsity/model_sparsity": 0.840322370616314, + "compression_loss": 147.25733947753906, + "distillation_loss": 6.815332889556885, + "epoch": 1.56, + "learning_rate": 3.88479652215916e-05, + "loss": 153.8392, + "step": 1847, + "task_loss": 3.1035261154174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.372929077497878, + "compression/movement_sparsity/importance_threshold": -0.0003866274633277534, + "compression/movement_sparsity/linear_layer_sparsity": 0.8706333459634117, + "compression/movement_sparsity/model_sparsity": 0.8407244121481162, + "compression_loss": 147.3348388671875, + "distillation_loss": 6.4255852699279785, + "epoch": 1.56, + "learning_rate": 3.884192730346577e-05, + "loss": 153.221, + "step": 1848, + "task_loss": 1.9447343349456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3736607700670453, + "compression/movement_sparsity/importance_threshold": -0.00038440120702629057, + "compression/movement_sparsity/linear_layer_sparsity": 0.8710746951801179, + "compression/movement_sparsity/model_sparsity": 0.8411505996614795, + "compression_loss": 147.41209411621094, + "distillation_loss": 4.901268482208252, + "epoch": 1.56, + "learning_rate": 3.8835889385339935e-05, + "loss": 153.0275, + "step": 1849, + "task_loss": 3.0868444442749023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3743896484375, + "compression/movement_sparsity/importance_threshold": -0.00038218351323848765, + "compression/movement_sparsity/linear_layer_sparsity": 0.8713199991567229, + "compression/movement_sparsity/model_sparsity": 0.8413874766918449, + "compression_loss": 147.48895263671875, + "distillation_loss": 7.811885833740234, + "epoch": 1.56, + "learning_rate": 3.882985146721411e-05, + "loss": 153.5925, + "step": 1850, + "task_loss": 4.08037805557251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3751157180315896, + "compression/movement_sparsity/importance_threshold": -0.00037997436546624574, + "compression/movement_sparsity/linear_layer_sparsity": 0.8717392051941293, + "compression/movement_sparsity/model_sparsity": 0.8417922817122377, + "compression_loss": 147.5656280517578, + "distillation_loss": 6.7906174659729, + "epoch": 1.56, + "learning_rate": 3.8823813549088276e-05, + "loss": 153.3325, + "step": 1851, + "task_loss": 3.44189453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.375838984271662, + "compression/movement_sparsity/importance_threshold": -0.00037777374721146683, + "compression/movement_sparsity/linear_layer_sparsity": 0.8721336328111884, + "compression/movement_sparsity/model_sparsity": 0.8421731595272496, + "compression_loss": 147.64190673828125, + "distillation_loss": 6.649721145629883, + "epoch": 1.57, + "learning_rate": 3.881777563096244e-05, + "loss": 153.2356, + "step": 1852, + "task_loss": 3.3841075897216797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.376559452580066, + "compression/movement_sparsity/importance_threshold": -0.0003755816419760494, + "compression/movement_sparsity/linear_layer_sparsity": 0.8725386967857786, + "compression/movement_sparsity/model_sparsity": 0.8425643083081901, + "compression_loss": 147.71788024902344, + "distillation_loss": 5.944565773010254, + "epoch": 1.57, + "learning_rate": 3.881173771283662e-05, + "loss": 154.516, + "step": 1853, + "task_loss": 3.51729154586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3772771283791496, + "compression/movement_sparsity/importance_threshold": -0.0003733980332618937, + "compression/movement_sparsity/linear_layer_sparsity": 0.8729470279823011, + "compression/movement_sparsity/model_sparsity": 0.8429586120719383, + "compression_loss": 147.79364013671875, + "distillation_loss": 6.437558174133301, + "epoch": 1.57, + "learning_rate": 3.8805699794710784e-05, + "loss": 154.7225, + "step": 1854, + "task_loss": 3.6579341888427734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.377992017091261, + "compression/movement_sparsity/importance_threshold": -0.00037122290457090087, + "compression/movement_sparsity/linear_layer_sparsity": 0.8733794579216156, + "compression/movement_sparsity/model_sparsity": 0.8433761867125273, + "compression_loss": 147.8690185546875, + "distillation_loss": 6.866792678833008, + "epoch": 1.57, + "learning_rate": 3.879966187658495e-05, + "loss": 154.4009, + "step": 1855, + "task_loss": 3.614643096923828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3787041241387479, + "compression/movement_sparsity/importance_threshold": -0.00036905623940497283, + "compression/movement_sparsity/linear_layer_sparsity": 0.8736728997629666, + "compression/movement_sparsity/model_sparsity": 0.8436595479238952, + "compression_loss": 147.94430541992188, + "distillation_loss": 5.848559856414795, + "epoch": 1.57, + "learning_rate": 3.8793623958459125e-05, + "loss": 153.4799, + "step": 1856, + "task_loss": 2.795853853225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3794134549439594, + "compression/movement_sparsity/importance_threshold": -0.0003668980212660081, + "compression/movement_sparsity/linear_layer_sparsity": 0.8739568380427118, + "compression/movement_sparsity/model_sparsity": 0.8439337320502348, + "compression_loss": 148.01904296875, + "distillation_loss": 8.760519027709961, + "epoch": 1.57, + "learning_rate": 3.878758604033329e-05, + "loss": 155.183, + "step": 1857, + "task_loss": 3.912856101989746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.380120014929243, + "compression/movement_sparsity/importance_threshold": -0.00036474823365590693, + "compression/movement_sparsity/linear_layer_sparsity": 0.8742087241598517, + "compression/movement_sparsity/model_sparsity": 0.8441769651043587, + "compression_loss": 148.0936279296875, + "distillation_loss": 8.316190719604492, + "epoch": 1.57, + "learning_rate": 3.878154812220747e-05, + "loss": 155.7073, + "step": 1858, + "task_loss": 4.517300605773926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3808238095169467, + "compression/movement_sparsity/importance_threshold": -0.00036260686007657215, + "compression/movement_sparsity/linear_layer_sparsity": 0.8745523190502792, + "compression/movement_sparsity/model_sparsity": 0.8445087564532784, + "compression_loss": 148.16793823242188, + "distillation_loss": 5.306129455566406, + "epoch": 1.57, + "learning_rate": 3.8775510204081634e-05, + "loss": 154.5206, + "step": 1859, + "task_loss": 3.934784412384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.38152484412942, + "compression/movement_sparsity/importance_threshold": -0.0003604738840299014, + "compression/movement_sparsity/linear_layer_sparsity": 0.874915743831485, + "compression/movement_sparsity/model_sparsity": 0.8448596964752244, + "compression_loss": 148.24195861816406, + "distillation_loss": 5.32120418548584, + "epoch": 1.57, + "learning_rate": 3.876947228595581e-05, + "loss": 155.0777, + "step": 1860, + "task_loss": 3.496925115585327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3822231241890095, + "compression/movement_sparsity/importance_threshold": -0.00035834928901779667, + "compression/movement_sparsity/linear_layer_sparsity": 0.8752382329451969, + "compression/movement_sparsity/model_sparsity": 0.8451711070957878, + "compression_loss": 148.31578063964844, + "distillation_loss": 6.774896621704102, + "epoch": 1.57, + "learning_rate": 3.8763434367829975e-05, + "loss": 154.906, + "step": 1861, + "task_loss": 2.6619465351104736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3829186551180643, + "compression/movement_sparsity/importance_threshold": -0.00035623305854215904, + "compression/movement_sparsity/linear_layer_sparsity": 0.875502067078308, + "compression/movement_sparsity/model_sparsity": 0.8454258777147778, + "compression_loss": 148.3892822265625, + "distillation_loss": 8.568330764770508, + "epoch": 1.57, + "learning_rate": 3.875739644970414e-05, + "loss": 155.346, + "step": 1862, + "task_loss": 3.8568685054779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3836114423389325, + "compression/movement_sparsity/importance_threshold": -0.000354125176104887, + "compression/movement_sparsity/linear_layer_sparsity": 0.8759798207788065, + "compression/movement_sparsity/model_sparsity": 0.8458872191059219, + "compression_loss": 148.4624481201172, + "distillation_loss": 5.960892200469971, + "epoch": 1.57, + "learning_rate": 3.8751358531578316e-05, + "loss": 155.2434, + "step": 1863, + "task_loss": 2.86572527885437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3843014912739622, + "compression/movement_sparsity/importance_threshold": -0.0003520256252078826, + "compression/movement_sparsity/linear_layer_sparsity": 0.8763086297013655, + "compression/movement_sparsity/model_sparsity": 0.8462047324304564, + "compression_loss": 148.53521728515625, + "distillation_loss": 7.4248948097229, + "epoch": 1.58, + "learning_rate": 3.874532061345248e-05, + "loss": 155.5059, + "step": 1864, + "task_loss": 4.450930118560791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3849888073455014, + "compression/movement_sparsity/importance_threshold": -0.0003499343893530451, + "compression/movement_sparsity/linear_layer_sparsity": 0.8766158439563358, + "compression/movement_sparsity/model_sparsity": 0.8465013929306668, + "compression_loss": 148.6078643798828, + "distillation_loss": 6.729351043701172, + "epoch": 1.58, + "learning_rate": 3.873928269532665e-05, + "loss": 155.0728, + "step": 1865, + "task_loss": 3.897453784942627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3856733959758984, + "compression/movement_sparsity/importance_threshold": -0.0003478514520422766, + "compression/movement_sparsity/linear_layer_sparsity": 0.877021301428458, + "compression/movement_sparsity/model_sparsity": 0.8468929216912886, + "compression_loss": 148.68002319335938, + "distillation_loss": 7.422763347625732, + "epoch": 1.58, + "learning_rate": 3.8733244777200824e-05, + "loss": 154.481, + "step": 1866, + "task_loss": 2.692875862121582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3863552625875017, + "compression/movement_sparsity/importance_threshold": -0.00034577679677747636, + "compression/movement_sparsity/linear_layer_sparsity": 0.8774258288155047, + "compression/movement_sparsity/model_sparsity": 0.8472835523181185, + "compression_loss": 148.7519989013672, + "distillation_loss": 7.419010639190674, + "epoch": 1.58, + "learning_rate": 3.872720685907499e-05, + "loss": 155.4015, + "step": 1867, + "task_loss": 3.4218645095825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3870344126026593, + "compression/movement_sparsity/importance_threshold": -0.00034371040706054554, + "compression/movement_sparsity/linear_layer_sparsity": 0.8778272797673013, + "compression/movement_sparsity/model_sparsity": 0.8476712121947132, + "compression_loss": 148.82383728027344, + "distillation_loss": 7.416689872741699, + "epoch": 1.58, + "learning_rate": 3.8721168940949166e-05, + "loss": 155.3377, + "step": 1868, + "task_loss": 2.8182859420776367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3877108514437193, + "compression/movement_sparsity/importance_threshold": -0.0003416522663933835, + "compression/movement_sparsity/linear_layer_sparsity": 0.8781502577718863, + "compression/movement_sparsity/model_sparsity": 0.8479830949112441, + "compression_loss": 148.89508056640625, + "distillation_loss": 6.940967559814453, + "epoch": 1.58, + "learning_rate": 3.871513102282333e-05, + "loss": 155.2136, + "step": 1869, + "task_loss": 3.554002285003662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3883845845330298, + "compression/movement_sparsity/importance_threshold": -0.0003396023582778922, + "compression/movement_sparsity/linear_layer_sparsity": 0.8784544432882772, + "compression/movement_sparsity/model_sparsity": 0.8482768307193629, + "compression_loss": 148.96621704101562, + "distillation_loss": 7.293410301208496, + "epoch": 1.58, + "learning_rate": 3.87090931046975e-05, + "loss": 155.2493, + "step": 1870, + "task_loss": 3.9629340171813965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3890556172929394, + "compression/movement_sparsity/importance_threshold": -0.0003375606662159702, + "compression/movement_sparsity/linear_layer_sparsity": 0.8786245176912675, + "compression/movement_sparsity/model_sparsity": 0.8484410625434005, + "compression_loss": 149.03701782226562, + "distillation_loss": 6.5900654792785645, + "epoch": 1.58, + "learning_rate": 3.8703055186571674e-05, + "loss": 155.2219, + "step": 1871, + "task_loss": 3.0699574947357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3897239551457958, + "compression/movement_sparsity/importance_threshold": -0.00033552717370952025, + "compression/movement_sparsity/linear_layer_sparsity": 0.8789879782449762, + "compression/movement_sparsity/model_sparsity": 0.8487920371089538, + "compression_loss": 149.1076202392578, + "distillation_loss": 6.195268154144287, + "epoch": 1.58, + "learning_rate": 3.869701726844584e-05, + "loss": 155.1058, + "step": 1872, + "task_loss": 3.514054298400879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3903896035139476, + "compression/movement_sparsity/importance_threshold": -0.00033350186426044006, + "compression/movement_sparsity/linear_layer_sparsity": 0.8794589947907604, + "compression/movement_sparsity/model_sparsity": 0.849246872787374, + "compression_loss": 149.17779541015625, + "distillation_loss": 7.014634609222412, + "epoch": 1.58, + "learning_rate": 3.869097935032001e-05, + "loss": 156.168, + "step": 1873, + "task_loss": 3.4906058311462402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.391052567819743, + "compression/movement_sparsity/importance_threshold": -0.00033148472137063244, + "compression/movement_sparsity/linear_layer_sparsity": 0.8797602827344158, + "compression/movement_sparsity/model_sparsity": 0.8495378105632947, + "compression_loss": 149.24778747558594, + "distillation_loss": 5.868443489074707, + "epoch": 1.58, + "learning_rate": 3.868494143219418e-05, + "loss": 155.0878, + "step": 1874, + "task_loss": 3.1042256355285645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3917128534855299, + "compression/movement_sparsity/importance_threshold": -0.0003294757285419968, + "compression/movement_sparsity/linear_layer_sparsity": 0.8801181866260062, + "compression/movement_sparsity/model_sparsity": 0.8498834193551678, + "compression_loss": 149.31739807128906, + "distillation_loss": 6.940214157104492, + "epoch": 1.58, + "learning_rate": 3.867890351406835e-05, + "loss": 155.6364, + "step": 1875, + "task_loss": 3.999532699584961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3923704659336564, + "compression/movement_sparsity/importance_threshold": -0.00032747486927643416, + "compression/movement_sparsity/linear_layer_sparsity": 0.8805425916005019, + "compression/movement_sparsity/model_sparsity": 0.8502932447131671, + "compression_loss": 149.38674926757812, + "distillation_loss": 6.561287879943848, + "epoch": 1.59, + "learning_rate": 3.867286559594252e-05, + "loss": 155.3528, + "step": 1876, + "task_loss": 2.372581958770752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3930254105864712, + "compression/movement_sparsity/importance_threshold": -0.00032548212707584484, + "compression/movement_sparsity/linear_layer_sparsity": 0.8809120738588667, + "compression/movement_sparsity/model_sparsity": 0.8506500341192967, + "compression_loss": 149.45570373535156, + "distillation_loss": 7.122910499572754, + "epoch": 1.59, + "learning_rate": 3.866682767781669e-05, + "loss": 156.1486, + "step": 1877, + "task_loss": 2.7499966621398926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3936776928663221, + "compression/movement_sparsity/importance_threshold": -0.00032349748544212905, + "compression/movement_sparsity/linear_layer_sparsity": 0.8813811706136615, + "compression/movement_sparsity/model_sparsity": 0.851103015957454, + "compression_loss": 149.52439880371094, + "distillation_loss": 5.738148212432861, + "epoch": 1.59, + "learning_rate": 3.8660789759690865e-05, + "loss": 155.0254, + "step": 1878, + "task_loss": 3.077583074569702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3943273181955573, + "compression/movement_sparsity/importance_threshold": -0.00032152092787718703, + "compression/movement_sparsity/linear_layer_sparsity": 0.8817419720779875, + "compression/movement_sparsity/model_sparsity": 0.8514514227815252, + "compression_loss": 149.5928955078125, + "distillation_loss": 7.894509315490723, + "epoch": 1.59, + "learning_rate": 3.865475184156503e-05, + "loss": 156.6239, + "step": 1879, + "task_loss": 4.385012626647949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3949742919965253, + "compression/movement_sparsity/importance_threshold": -0.0003195524378829199, + "compression/movement_sparsity/linear_layer_sparsity": 0.8820942119899509, + "compression/movement_sparsity/model_sparsity": 0.851791562168896, + "compression_loss": 149.6609649658203, + "distillation_loss": 6.579346656799316, + "epoch": 1.59, + "learning_rate": 3.86487139234392e-05, + "loss": 155.8185, + "step": 1880, + "task_loss": 2.5204596519470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.395618619691574, + "compression/movement_sparsity/importance_threshold": -0.00031759199896122785, + "compression/movement_sparsity/linear_layer_sparsity": 0.882432739109133, + "compression/movement_sparsity/model_sparsity": 0.8521184598401029, + "compression_loss": 149.72869873046875, + "distillation_loss": 5.737729072570801, + "epoch": 1.59, + "learning_rate": 3.864267600531337e-05, + "loss": 155.3987, + "step": 1881, + "task_loss": 3.7331557273864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3962603067030517, + "compression/movement_sparsity/importance_threshold": -0.0003156395946140103, + "compression/movement_sparsity/linear_layer_sparsity": 0.882787089598768, + "compression/movement_sparsity/model_sparsity": 0.8524606373003093, + "compression_loss": 149.79624938964844, + "distillation_loss": 6.429390907287598, + "epoch": 1.59, + "learning_rate": 3.863663808718754e-05, + "loss": 155.9553, + "step": 1882, + "task_loss": 2.23868989944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3968993584533067, + "compression/movement_sparsity/importance_threshold": -0.0003136952083431692, + "compression/movement_sparsity/linear_layer_sparsity": 0.8832131401083974, + "compression/movement_sparsity/model_sparsity": 0.8528720516642483, + "compression_loss": 149.86355590820312, + "distillation_loss": 6.766284465789795, + "epoch": 1.59, + "learning_rate": 3.863060016906171e-05, + "loss": 155.9906, + "step": 1883, + "task_loss": 3.0568654537200928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3975357803646873, + "compression/movement_sparsity/importance_threshold": -0.00031175882365060484, + "compression/movement_sparsity/linear_layer_sparsity": 0.8834918079060475, + "compression/movement_sparsity/model_sparsity": 0.8531411463657667, + "compression_loss": 149.9305877685547, + "distillation_loss": 6.789269924163818, + "epoch": 1.59, + "learning_rate": 3.862456225093588e-05, + "loss": 156.1786, + "step": 1884, + "task_loss": 3.200939416885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.398169577859541, + "compression/movement_sparsity/importance_threshold": -0.0003098304240382165, + "compression/movement_sparsity/linear_layer_sparsity": 0.8839675225738802, + "compression/movement_sparsity/model_sparsity": 0.8536005187712898, + "compression_loss": 149.99729919433594, + "distillation_loss": 5.876088619232178, + "epoch": 1.59, + "learning_rate": 3.861852433281005e-05, + "loss": 156.4167, + "step": 1885, + "task_loss": 3.2096548080444336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3988007563602167, + "compression/movement_sparsity/importance_threshold": -0.00030790999300790624, + "compression/movement_sparsity/linear_layer_sparsity": 0.8843559881071213, + "compression/movement_sparsity/model_sparsity": 0.8539756393184045, + "compression_loss": 150.06370544433594, + "distillation_loss": 5.651353359222412, + "epoch": 1.59, + "learning_rate": 3.8612486414684216e-05, + "loss": 155.6909, + "step": 1886, + "task_loss": 2.902829170227051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.3994293212890625, + "compression/movement_sparsity/importance_threshold": -0.0003059975140615734, + "compression/movement_sparsity/linear_layer_sparsity": 0.8847507019042037, + "compression/movement_sparsity/model_sparsity": 0.8543567934822753, + "compression_loss": 150.1298065185547, + "distillation_loss": 6.559399604797363, + "epoch": 1.59, + "learning_rate": 3.860644849655839e-05, + "loss": 156.243, + "step": 1887, + "task_loss": 3.0456230640411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4000552780684263, + "compression/movement_sparsity/importance_threshold": -0.00030409297070111904, + "compression/movement_sparsity/linear_layer_sparsity": 0.8850197945952423, + "compression/movement_sparsity/model_sparsity": 0.8546166420115507, + "compression_loss": 150.1956024169922, + "distillation_loss": 6.62272310256958, + "epoch": 1.6, + "learning_rate": 3.8600410578432563e-05, + "loss": 156.3165, + "step": 1888, + "task_loss": 2.7173609733581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4006786321206566, + "compression/movement_sparsity/importance_threshold": -0.00030219634642844346, + "compression/movement_sparsity/linear_layer_sparsity": 0.8852501218172966, + "compression/movement_sparsity/model_sparsity": 0.8548390567849581, + "compression_loss": 150.26124572753906, + "distillation_loss": 5.735922813415527, + "epoch": 1.6, + "learning_rate": 3.8594372660306724e-05, + "loss": 156.0371, + "step": 1889, + "task_loss": 3.3753914833068848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4012993888681013, + "compression/movement_sparsity/importance_threshold": -0.000300307624745446, + "compression/movement_sparsity/linear_layer_sparsity": 0.8854829769628897, + "compression/movement_sparsity/model_sparsity": 0.8550639126399538, + "compression_loss": 150.32656860351562, + "distillation_loss": 6.453309059143066, + "epoch": 1.6, + "learning_rate": 3.85883347421809e-05, + "loss": 156.4494, + "step": 1890, + "task_loss": 3.215193748474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4019175537331088, + "compression/movement_sparsity/importance_threshold": -0.00029842678915402866, + "compression/movement_sparsity/linear_layer_sparsity": 0.8857196716904616, + "compression/movement_sparsity/model_sparsity": 0.8552924761754755, + "compression_loss": 150.39154052734375, + "distillation_loss": 6.791268348693848, + "epoch": 1.6, + "learning_rate": 3.858229682405507e-05, + "loss": 156.3278, + "step": 1891, + "task_loss": 3.4626686573028564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4025331321380274, + "compression/movement_sparsity/importance_threshold": -0.00029655382315609165, + "compression/movement_sparsity/linear_layer_sparsity": 0.8860986455862646, + "compression/movement_sparsity/model_sparsity": 0.8556584311520976, + "compression_loss": 150.4563446044922, + "distillation_loss": 6.4242658615112305, + "epoch": 1.6, + "learning_rate": 3.857625890592924e-05, + "loss": 157.1079, + "step": 1892, + "task_loss": 2.8037991523742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.403146129505205, + "compression/movement_sparsity/importance_threshold": -0.0002946887102535352, + "compression/movement_sparsity/linear_layer_sparsity": 0.8864198230415365, + "compression/movement_sparsity/model_sparsity": 0.8559685751737235, + "compression_loss": 150.5208282470703, + "distillation_loss": 6.069797515869141, + "epoch": 1.6, + "learning_rate": 3.8570220987803406e-05, + "loss": 156.7057, + "step": 1893, + "task_loss": 3.1066486835479736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.40375655125699, + "compression/movement_sparsity/importance_threshold": -0.0002928314339482596, + "compression/movement_sparsity/linear_layer_sparsity": 0.8867824966001813, + "compression/movement_sparsity/model_sparsity": 0.8563187897799145, + "compression_loss": 150.58505249023438, + "distillation_loss": 5.87023401260376, + "epoch": 1.6, + "learning_rate": 3.856418306967758e-05, + "loss": 156.1895, + "step": 1894, + "task_loss": 3.146287679672241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4043644028157305, + "compression/movement_sparsity/importance_threshold": -0.00029098197774216506, + "compression/movement_sparsity/linear_layer_sparsity": 0.8870396055027459, + "compression/movement_sparsity/model_sparsity": 0.8565670662007164, + "compression_loss": 150.6488494873047, + "distillation_loss": 5.058915615081787, + "epoch": 1.6, + "learning_rate": 3.855814515155175e-05, + "loss": 157.0146, + "step": 1895, + "task_loss": 3.167494773864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4049696896037747, + "compression/movement_sparsity/importance_threshold": -0.0002891403251371527, + "compression/movement_sparsity/linear_layer_sparsity": 0.8873788599961537, + "compression/movement_sparsity/model_sparsity": 0.8568946662586069, + "compression_loss": 150.7125244140625, + "distillation_loss": 5.118819236755371, + "epoch": 1.6, + "learning_rate": 3.8552107233425914e-05, + "loss": 156.6973, + "step": 1896, + "task_loss": 2.721879005432129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.405572417043471, + "compression/movement_sparsity/importance_threshold": -0.0002873064596351227, + "compression/movement_sparsity/linear_layer_sparsity": 0.8877576192569393, + "compression/movement_sparsity/model_sparsity": 0.8572604139735847, + "compression_loss": 150.77581787109375, + "distillation_loss": 5.839515686035156, + "epoch": 1.6, + "learning_rate": 3.854606931530009e-05, + "loss": 156.4697, + "step": 1897, + "task_loss": 3.584444522857666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4061725905571674, + "compression/movement_sparsity/importance_threshold": -0.0002854803647379762, + "compression/movement_sparsity/linear_layer_sparsity": 0.8880939642534441, + "compression/movement_sparsity/model_sparsity": 0.8575852044847413, + "compression_loss": 150.83892822265625, + "distillation_loss": 6.333454132080078, + "epoch": 1.6, + "learning_rate": 3.8540031397174256e-05, + "loss": 156.2799, + "step": 1898, + "task_loss": 2.3072192668914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4067702155672117, + "compression/movement_sparsity/importance_threshold": -0.0002836620239476126, + "compression/movement_sparsity/linear_layer_sparsity": 0.8883948586995675, + "compression/movement_sparsity/model_sparsity": 0.8578757622809807, + "compression_loss": 150.90158081054688, + "distillation_loss": 6.045659065246582, + "epoch": 1.6, + "learning_rate": 3.853399347904842e-05, + "loss": 157.1078, + "step": 1899, + "task_loss": 2.536705255508423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4073652974959527, + "compression/movement_sparsity/importance_threshold": -0.00028185142076593214, + "compression/movement_sparsity/linear_layer_sparsity": 0.8888079595631443, + "compression/movement_sparsity/model_sparsity": 0.8582746718590468, + "compression_loss": 150.9640655517578, + "distillation_loss": 5.64579963684082, + "epoch": 1.61, + "learning_rate": 3.85279555609226e-05, + "loss": 156.3829, + "step": 1900, + "task_loss": 3.3389205932617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4079578417657388, + "compression/movement_sparsity/importance_threshold": -0.0002800485386948359, + "compression/movement_sparsity/linear_layer_sparsity": 0.8892509066183137, + "compression/movement_sparsity/model_sparsity": 0.8587024023202066, + "compression_loss": 151.02621459960938, + "distillation_loss": 6.069151878356934, + "epoch": 1.61, + "learning_rate": 3.852191764279677e-05, + "loss": 156.895, + "step": 1901, + "task_loss": 3.4109585285186768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4085478537989173, + "compression/movement_sparsity/importance_threshold": -0.00027825336123622497, + "compression/movement_sparsity/linear_layer_sparsity": 0.8894739243256072, + "compression/movement_sparsity/model_sparsity": 0.8589177586831719, + "compression_loss": 151.0880889892578, + "distillation_loss": 9.036394119262695, + "epoch": 1.61, + "learning_rate": 3.851587972467093e-05, + "loss": 158.418, + "step": 1902, + "task_loss": 3.5200376510620117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4091353390178372, + "compression/movement_sparsity/importance_threshold": -0.00027646587189199875, + "compression/movement_sparsity/linear_layer_sparsity": 0.8897052889502459, + "compression/movement_sparsity/model_sparsity": 0.8591411752211934, + "compression_loss": 151.1497802734375, + "distillation_loss": 6.922142028808594, + "epoch": 1.61, + "learning_rate": 3.8509841806545105e-05, + "loss": 156.95, + "step": 1903, + "task_loss": 4.287223815917969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4097203028448462, + "compression/movement_sparsity/importance_threshold": -0.0002746860541640592, + "compression/movement_sparsity/linear_layer_sparsity": 0.890032058840139, + "compression/movement_sparsity/model_sparsity": 0.8594567195601069, + "compression_loss": 151.21107482910156, + "distillation_loss": 7.750855922698975, + "epoch": 1.61, + "learning_rate": 3.850380388841928e-05, + "loss": 157.382, + "step": 1904, + "task_loss": 3.9015185832977295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.410302750702293, + "compression/movement_sparsity/importance_threshold": -0.00027291389155430397, + "compression/movement_sparsity/linear_layer_sparsity": 0.8901319118199218, + "compression/movement_sparsity/model_sparsity": 0.8595531422828504, + "compression_loss": 151.2722930908203, + "distillation_loss": 5.996115684509277, + "epoch": 1.61, + "learning_rate": 3.849776597029344e-05, + "loss": 157.5897, + "step": 1905, + "task_loss": 3.646399974822998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4108826880125251, + "compression/movement_sparsity/importance_threshold": -0.00027114936756463676, + "compression/movement_sparsity/linear_layer_sparsity": 0.8905521314115772, + "compression/movement_sparsity/model_sparsity": 0.8599589260387859, + "compression_loss": 151.33309936523438, + "distillation_loss": 5.609531402587891, + "epoch": 1.61, + "learning_rate": 3.8491728052167613e-05, + "loss": 156.9539, + "step": 1906, + "task_loss": 2.981043815612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4114601201978914, + "compression/movement_sparsity/importance_threshold": -0.0002693924656969552, + "compression/movement_sparsity/linear_layer_sparsity": 0.8908761110462436, + "compression/movement_sparsity/model_sparsity": 0.8602717759763235, + "compression_loss": 151.39352416992188, + "distillation_loss": 8.969968795776367, + "epoch": 1.61, + "learning_rate": 3.848569013404179e-05, + "loss": 158.0768, + "step": 1907, + "task_loss": 3.391901731491089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4120350526807395, + "compression/movement_sparsity/importance_threshold": -0.00026764316945316214, + "compression/movement_sparsity/linear_layer_sparsity": 0.891250816090033, + "compression/movement_sparsity/model_sparsity": 0.8606336087491312, + "compression_loss": 151.4539337158203, + "distillation_loss": 8.696922302246094, + "epoch": 1.61, + "learning_rate": 3.8479652215915955e-05, + "loss": 157.4791, + "step": 1908, + "task_loss": 3.6983954906463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.412607490883418, + "compression/movement_sparsity/importance_threshold": -0.0002659014623351561, + "compression/movement_sparsity/linear_layer_sparsity": 0.8915811870785523, + "compression/movement_sparsity/model_sparsity": 0.8609526304778548, + "compression_loss": 151.513916015625, + "distillation_loss": 6.751202583312988, + "epoch": 1.61, + "learning_rate": 3.847361429779012e-05, + "loss": 157.8596, + "step": 1909, + "task_loss": 2.997176170349121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.413177440228275, + "compression/movement_sparsity/importance_threshold": -0.000264167327844839, + "compression/movement_sparsity/linear_layer_sparsity": 0.8919303147827626, + "compression/movement_sparsity/model_sparsity": 0.8612897645713831, + "compression_loss": 151.57373046875, + "distillation_loss": 5.813370704650879, + "epoch": 1.61, + "learning_rate": 3.8467576379664296e-05, + "loss": 157.6691, + "step": 1910, + "task_loss": 2.9291272163391113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4137449061376586, + "compression/movement_sparsity/importance_threshold": -0.00026244074948411113, + "compression/movement_sparsity/linear_layer_sparsity": 0.892104872672784, + "compression/movement_sparsity/model_sparsity": 0.8614583258608793, + "compression_loss": 151.6332244873047, + "distillation_loss": 7.464130401611328, + "epoch": 1.61, + "learning_rate": 3.846153846153846e-05, + "loss": 158.2357, + "step": 1911, + "task_loss": 3.74951171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4143098940339172, + "compression/movement_sparsity/importance_threshold": -0.00026072171075487184, + "compression/movement_sparsity/linear_layer_sparsity": 0.892312651293839, + "compression/movement_sparsity/model_sparsity": 0.8616589666470993, + "compression_loss": 151.69247436523438, + "distillation_loss": 5.525993824005127, + "epoch": 1.62, + "learning_rate": 3.845550054341263e-05, + "loss": 157.0393, + "step": 1912, + "task_loss": 2.8973495960235596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4148724093393985, + "compression/movement_sparsity/importance_threshold": -0.00025901019515902225, + "compression/movement_sparsity/linear_layer_sparsity": 0.892653539398213, + "compression/movement_sparsity/model_sparsity": 0.8619881441963936, + "compression_loss": 151.7513885498047, + "distillation_loss": 6.149102210998535, + "epoch": 1.62, + "learning_rate": 3.8449462625286804e-05, + "loss": 158.2012, + "step": 1913, + "task_loss": 3.071350574493408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4154324574764514, + "compression/movement_sparsity/importance_threshold": -0.0002573061861984626, + "compression/movement_sparsity/linear_layer_sparsity": 0.8929730116975131, + "compression/movement_sparsity/model_sparsity": 0.8622966416394009, + "compression_loss": 151.8101806640625, + "distillation_loss": 6.122520446777344, + "epoch": 1.62, + "learning_rate": 3.844342470716097e-05, + "loss": 157.5365, + "step": 1914, + "task_loss": 3.0856828689575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4159900438674236, + "compression/movement_sparsity/importance_threshold": -0.0002556096673750948, + "compression/movement_sparsity/linear_layer_sparsity": 0.8933443779743644, + "compression/movement_sparsity/model_sparsity": 0.862655250342186, + "compression_loss": 151.86865234375, + "distillation_loss": 7.150974273681641, + "epoch": 1.62, + "learning_rate": 3.843738678903514e-05, + "loss": 158.3788, + "step": 1915, + "task_loss": 3.0702965259552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4165451739346633, + "compression/movement_sparsity/importance_threshold": -0.00025392062219081746, + "compression/movement_sparsity/linear_layer_sparsity": 0.8936192896592092, + "compression/movement_sparsity/model_sparsity": 0.8629207179649292, + "compression_loss": 151.92684936523438, + "distillation_loss": 7.095952033996582, + "epoch": 1.62, + "learning_rate": 3.843134887090931e-05, + "loss": 157.9278, + "step": 1916, + "task_loss": 3.2225875854492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.417097853100519, + "compression/movement_sparsity/importance_threshold": -0.0002522390341475316, + "compression/movement_sparsity/linear_layer_sparsity": 0.8938943802065685, + "compression/movement_sparsity/model_sparsity": 0.8631863583057093, + "compression_loss": 151.98477172851562, + "distillation_loss": 6.597002029418945, + "epoch": 1.62, + "learning_rate": 3.8425310952783486e-05, + "loss": 158.3976, + "step": 1917, + "task_loss": 3.2087225914001465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4176480867873387, + "compression/movement_sparsity/importance_threshold": -0.0002505648867471375, + "compression/movement_sparsity/linear_layer_sparsity": 0.8940815180934458, + "compression/movement_sparsity/model_sparsity": 0.8633670674304688, + "compression_loss": 152.04229736328125, + "distillation_loss": 7.084159851074219, + "epoch": 1.62, + "learning_rate": 3.8419273034657653e-05, + "loss": 158.5595, + "step": 1918, + "task_loss": 3.503180503845215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4181958804174704, + "compression/movement_sparsity/importance_threshold": -0.0002488981634915363, + "compression/movement_sparsity/linear_layer_sparsity": 0.8944814904484556, + "compression/movement_sparsity/model_sparsity": 0.8637532995046251, + "compression_loss": 152.09979248046875, + "distillation_loss": 7.364809036254883, + "epoch": 1.62, + "learning_rate": 3.841323511653182e-05, + "loss": 159.6463, + "step": 1919, + "task_loss": 2.637944221496582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4187412394132628, + "compression/movement_sparsity/importance_threshold": -0.00024723884788262814, + "compression/movement_sparsity/linear_layer_sparsity": 0.894842733106984, + "compression/movement_sparsity/model_sparsity": 0.8641021323665207, + "compression_loss": 152.15696716308594, + "distillation_loss": 5.917118549346924, + "epoch": 1.62, + "learning_rate": 3.8407197198405995e-05, + "loss": 158.3999, + "step": 1920, + "task_loss": 2.311518430709839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4192841691970637, + "compression/movement_sparsity/importance_threshold": -0.00024558692342231334, + "compression/movement_sparsity/linear_layer_sparsity": 0.895067145941891, + "compression/movement_sparsity/model_sparsity": 0.864318835930174, + "compression_loss": 152.2137908935547, + "distillation_loss": 5.295403003692627, + "epoch": 1.62, + "learning_rate": 3.840115928028016e-05, + "loss": 158.3144, + "step": 1921, + "task_loss": 3.3356878757476807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4198246751912214, + "compression/movement_sparsity/importance_threshold": -0.0002439423736124921, + "compression/movement_sparsity/linear_layer_sparsity": 0.8952257492956156, + "compression/movement_sparsity/model_sparsity": 0.8644719907707772, + "compression_loss": 152.27037048339844, + "distillation_loss": 6.522974014282227, + "epoch": 1.62, + "learning_rate": 3.839512136215433e-05, + "loss": 158.8991, + "step": 1922, + "task_loss": 2.797461748123169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4203627628180842, + "compression/movement_sparsity/importance_threshold": -0.00024230518195506555, + "compression/movement_sparsity/linear_layer_sparsity": 0.8956119492370059, + "compression/movement_sparsity/model_sparsity": 0.8648449235560908, + "compression_loss": 152.32669067382812, + "distillation_loss": 7.050638198852539, + "epoch": 1.63, + "learning_rate": 3.83890834440285e-05, + "loss": 158.3056, + "step": 1923, + "task_loss": 3.2105298042297363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4208984375, + "compression/movement_sparsity/importance_threshold": -0.0002406753319519339, + "compression/movement_sparsity/linear_layer_sparsity": 0.895969829280261, + "compression/movement_sparsity/model_sparsity": 0.8651905093188923, + "compression_loss": 152.38287353515625, + "distillation_loss": 7.551097869873047, + "epoch": 1.63, + "learning_rate": 3.838304552590267e-05, + "loss": 159.6248, + "step": 1924, + "task_loss": 3.4849605560302734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4214317046593172, + "compression/movement_sparsity/importance_threshold": -0.00023905280710499743, + "compression/movement_sparsity/linear_layer_sparsity": 0.8963923979329408, + "compression/movement_sparsity/model_sparsity": 0.8655985614383792, + "compression_loss": 152.43861389160156, + "distillation_loss": 6.809576034545898, + "epoch": 1.63, + "learning_rate": 3.837700760777684e-05, + "loss": 159.0645, + "step": 1925, + "task_loss": 3.5584347248077393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4219625697183842, + "compression/movement_sparsity/importance_threshold": -0.00023743759091615722, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966419230648889, + "compression/movement_sparsity/model_sparsity": 0.8658395146144158, + "compression_loss": 152.49424743652344, + "distillation_loss": 7.455321788787842, + "epoch": 1.63, + "learning_rate": 3.837096968965101e-05, + "loss": 159.3402, + "step": 1926, + "task_loss": 3.625943422317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4224910380995488, + "compression/movement_sparsity/importance_threshold": -0.00023582966688731177, + "compression/movement_sparsity/linear_layer_sparsity": 0.8968738242770711, + "compression/movement_sparsity/model_sparsity": 0.866063449306548, + "compression_loss": 152.5496063232422, + "distillation_loss": 6.751021862030029, + "epoch": 1.63, + "learning_rate": 3.836493177152518e-05, + "loss": 158.7307, + "step": 1927, + "task_loss": 2.8171238899230957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4230171152251594, + "compression/movement_sparsity/importance_threshold": -0.00023422901852036392, + "compression/movement_sparsity/linear_layer_sparsity": 0.8971317917197054, + "compression/movement_sparsity/model_sparsity": 0.8663125547739271, + "compression_loss": 152.60470581054688, + "distillation_loss": 4.98211145401001, + "epoch": 1.63, + "learning_rate": 3.835889385339935e-05, + "loss": 158.6095, + "step": 1928, + "task_loss": 3.651984214782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4235408065175643, + "compression/movement_sparsity/importance_threshold": -0.00023263562931721306, + "compression/movement_sparsity/linear_layer_sparsity": 0.8974040681635027, + "compression/movement_sparsity/model_sparsity": 0.8665754776842597, + "compression_loss": 152.65965270996094, + "distillation_loss": 6.469775676727295, + "epoch": 1.63, + "learning_rate": 3.835285593527352e-05, + "loss": 159.7873, + "step": 1929, + "task_loss": 2.6886167526245117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4240621173991113, + "compression/movement_sparsity/importance_threshold": -0.00023104948277976028, + "compression/movement_sparsity/linear_layer_sparsity": 0.8976850969463447, + "compression/movement_sparsity/model_sparsity": 0.8668468522638654, + "compression_loss": 152.71426391601562, + "distillation_loss": 6.927318096160889, + "epoch": 1.63, + "learning_rate": 3.834681801714769e-05, + "loss": 158.5927, + "step": 1930, + "task_loss": 3.365535259246826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.424581053292149, + "compression/movement_sparsity/importance_threshold": -0.00022947056240990495, + "compression/movement_sparsity/linear_layer_sparsity": 0.8979098794304483, + "compression/movement_sparsity/model_sparsity": 0.8670639127781283, + "compression_loss": 152.7686309814453, + "distillation_loss": 5.36340856552124, + "epoch": 1.63, + "learning_rate": 3.834078009902186e-05, + "loss": 158.8465, + "step": 1931, + "task_loss": 3.8297770023345947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4250976196190255, + "compression/movement_sparsity/importance_threshold": -0.0002278988517095482, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982550960076742, + "compression/movement_sparsity/model_sparsity": 0.867397270103916, + "compression_loss": 152.82275390625, + "distillation_loss": 7.523814678192139, + "epoch": 1.63, + "learning_rate": 3.833474218089603e-05, + "loss": 159.0863, + "step": 1932, + "task_loss": 3.859551191329956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.425611821802089, + "compression/movement_sparsity/importance_threshold": -0.00022633433418059023, + "compression/movement_sparsity/linear_layer_sparsity": 0.8984958449522423, + "compression/movement_sparsity/model_sparsity": 0.8676297485816078, + "compression_loss": 152.87664794921875, + "distillation_loss": 7.889961242675781, + "epoch": 1.63, + "learning_rate": 3.83287042627702e-05, + "loss": 159.1737, + "step": 1933, + "task_loss": 3.9454383850097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4261236652636875, + "compression/movement_sparsity/importance_threshold": -0.00022477699332493217, + "compression/movement_sparsity/linear_layer_sparsity": 0.8987760748158526, + "compression/movement_sparsity/model_sparsity": 0.8679003516873153, + "compression_loss": 152.93026733398438, + "distillation_loss": 7.234272003173828, + "epoch": 1.63, + "learning_rate": 3.832266634464437e-05, + "loss": 159.5948, + "step": 1934, + "task_loss": 3.3416173458099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4266331554261695, + "compression/movement_sparsity/importance_threshold": -0.0002232268126444734, + "compression/movement_sparsity/linear_layer_sparsity": 0.8990791871571563, + "compression/movement_sparsity/model_sparsity": 0.8681930511872126, + "compression_loss": 152.98362731933594, + "distillation_loss": 7.496713638305664, + "epoch": 1.64, + "learning_rate": 3.8316628426518536e-05, + "loss": 160.0006, + "step": 1935, + "task_loss": 4.045901775360107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4271402977118828, + "compression/movement_sparsity/importance_threshold": -0.00022168377564111587, + "compression/movement_sparsity/linear_layer_sparsity": 0.8992916996727627, + "compression/movement_sparsity/model_sparsity": 0.8683982632441428, + "compression_loss": 153.0367889404297, + "distillation_loss": 7.040009498596191, + "epoch": 1.64, + "learning_rate": 3.831059050839271e-05, + "loss": 159.8056, + "step": 1936, + "task_loss": 3.3511879444122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.427645097543176, + "compression/movement_sparsity/importance_threshold": -0.0002201478658167581, + "compression/movement_sparsity/linear_layer_sparsity": 0.8995876694376524, + "compression/movement_sparsity/model_sparsity": 0.8686840655370992, + "compression_loss": 153.0896759033203, + "distillation_loss": 7.740026473999023, + "epoch": 1.64, + "learning_rate": 3.830455259026688e-05, + "loss": 159.3719, + "step": 1937, + "task_loss": 3.3991854190826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.428147560342397, + "compression/movement_sparsity/importance_threshold": -0.00021861906667330292, + "compression/movement_sparsity/linear_layer_sparsity": 0.8998162557312319, + "compression/movement_sparsity/model_sparsity": 0.8689047991882805, + "compression_loss": 153.14227294921875, + "distillation_loss": 7.398360252380371, + "epoch": 1.64, + "learning_rate": 3.829851467214105e-05, + "loss": 159.4896, + "step": 1938, + "task_loss": 2.395493984222412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4286476915318942, + "compression/movement_sparsity/importance_threshold": -0.00021709736171264798, + "compression/movement_sparsity/linear_layer_sparsity": 0.9000546317664405, + "compression/movement_sparsity/model_sparsity": 0.8691349862733492, + "compression_loss": 153.19480895996094, + "distillation_loss": 6.806671619415283, + "epoch": 1.64, + "learning_rate": 3.829247675401522e-05, + "loss": 160.3127, + "step": 1939, + "task_loss": 2.657777786254883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4291454965340158, + "compression/movement_sparsity/importance_threshold": -0.0002155827344366961, + "compression/movement_sparsity/linear_layer_sparsity": 0.9003250122675837, + "compression/movement_sparsity/model_sparsity": 0.8693960783724903, + "compression_loss": 153.2468719482422, + "distillation_loss": 6.341867446899414, + "epoch": 1.64, + "learning_rate": 3.8286438835889386e-05, + "loss": 158.6906, + "step": 1940, + "task_loss": 3.045822858810425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.42964098077111, + "compression/movement_sparsity/importance_threshold": -0.00021407516834734668, + "compression/movement_sparsity/linear_layer_sparsity": 0.9006473940637869, + "compression/movement_sparsity/model_sparsity": 0.8697073853622316, + "compression_loss": 153.2987060546875, + "distillation_loss": 5.51707649230957, + "epoch": 1.64, + "learning_rate": 3.828040091776356e-05, + "loss": 159.9611, + "step": 1941, + "task_loss": 2.703279972076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4301341496655244, + "compression/movement_sparsity/importance_threshold": -0.0002125746469465008, + "compression/movement_sparsity/linear_layer_sparsity": 0.9010875508637295, + "compression/movement_sparsity/model_sparsity": 0.8701324214220154, + "compression_loss": 153.35035705566406, + "distillation_loss": 6.027688980102539, + "epoch": 1.64, + "learning_rate": 3.827436299963773e-05, + "loss": 159.7969, + "step": 1942, + "task_loss": 2.8079710006713867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4306250086396082, + "compression/movement_sparsity/importance_threshold": -0.000211081153736057, + "compression/movement_sparsity/linear_layer_sparsity": 0.9012942920822, + "compression/movement_sparsity/model_sparsity": 0.8703320604436212, + "compression_loss": 153.4016876220703, + "distillation_loss": 8.256545066833496, + "epoch": 1.64, + "learning_rate": 3.8268325081511894e-05, + "loss": 160.3008, + "step": 1943, + "task_loss": 3.6278116703033447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4311135631157088, + "compression/movement_sparsity/importance_threshold": -0.00020959467221791893, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014992446755252, + "compression/movement_sparsity/model_sparsity": 0.8705299722848577, + "compression_loss": 153.4528350830078, + "distillation_loss": 6.8891825675964355, + "epoch": 1.64, + "learning_rate": 3.826228716338607e-05, + "loss": 160.1271, + "step": 1944, + "task_loss": 3.0895731449127197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.431599818516175, + "compression/movement_sparsity/importance_threshold": -0.00020811518589398427, + "compression/movement_sparsity/linear_layer_sparsity": 0.9017244922021668, + "compression/movement_sparsity/model_sparsity": 0.8707474818660167, + "compression_loss": 153.5037841796875, + "distillation_loss": 6.901221752166748, + "epoch": 1.64, + "learning_rate": 3.8256249245260235e-05, + "loss": 159.4664, + "step": 1945, + "task_loss": 3.3441033363342285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4320837802633544, + "compression/movement_sparsity/importance_threshold": -0.00020664267826615498, + "compression/movement_sparsity/linear_layer_sparsity": 0.9019950754141597, + "compression/movement_sparsity/model_sparsity": 0.8710087697122664, + "compression_loss": 153.5543975830078, + "distillation_loss": 5.953382968902588, + "epoch": 1.64, + "learning_rate": 3.82502113271344e-05, + "loss": 160.2562, + "step": 1946, + "task_loss": 4.020930290222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4325654537795955, + "compression/movement_sparsity/importance_threshold": -0.00020517713283633218, + "compression/movement_sparsity/linear_layer_sparsity": 0.9022372433346765, + "compression/movement_sparsity/model_sparsity": 0.8712426184197177, + "compression_loss": 153.60484313964844, + "distillation_loss": 7.471131324768066, + "epoch": 1.65, + "learning_rate": 3.8244173409008576e-05, + "loss": 160.126, + "step": 1947, + "task_loss": 3.3768246173858643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4330448444872466, + "compression/movement_sparsity/importance_threshold": -0.00020371853310641348, + "compression/movement_sparsity/linear_layer_sparsity": 0.9024447596240434, + "compression/movement_sparsity/model_sparsity": 0.8714430058861501, + "compression_loss": 153.65501403808594, + "distillation_loss": 7.707447052001953, + "epoch": 1.65, + "learning_rate": 3.8238135490882743e-05, + "loss": 160.0766, + "step": 1948, + "task_loss": 4.069027900695801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4335219578086558, + "compression/movement_sparsity/importance_threshold": -0.00020226686257830175, + "compression/movement_sparsity/linear_layer_sparsity": 0.9027659728518183, + "compression/movement_sparsity/model_sparsity": 0.8717531844513834, + "compression_loss": 153.70481872558594, + "distillation_loss": 8.208759307861328, + "epoch": 1.65, + "learning_rate": 3.823209757275692e-05, + "loss": 161.3708, + "step": 1949, + "task_loss": 3.846348524093628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.433996799166171, + "compression/movement_sparsity/importance_threshold": -0.0002008221047538972, + "compression/movement_sparsity/linear_layer_sparsity": 0.9030649355827847, + "compression/movement_sparsity/model_sparsity": 0.8720418768928242, + "compression_loss": 153.75440979003906, + "distillation_loss": 9.980860710144043, + "epoch": 1.65, + "learning_rate": 3.8226059654631085e-05, + "loss": 161.0352, + "step": 1950, + "task_loss": 4.253302097320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.434469373982141, + "compression/movement_sparsity/importance_threshold": -0.00019938424313509923, + "compression/movement_sparsity/linear_layer_sparsity": 0.9032926871846296, + "compression/movement_sparsity/model_sparsity": 0.8722618045264999, + "compression_loss": 153.80381774902344, + "distillation_loss": 6.548616409301758, + "epoch": 1.65, + "learning_rate": 3.822002173650526e-05, + "loss": 160.1709, + "step": 1951, + "task_loss": 2.5666110515594482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4349396876789133, + "compression/movement_sparsity/importance_threshold": -0.0001979532612238098, + "compression/movement_sparsity/linear_layer_sparsity": 0.9035432020224915, + "compression/movement_sparsity/model_sparsity": 0.8725037134090075, + "compression_loss": 153.85301208496094, + "distillation_loss": 8.077263832092285, + "epoch": 1.65, + "learning_rate": 3.8213983818379426e-05, + "loss": 160.1771, + "step": 1952, + "task_loss": 2.981919527053833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4354077456788368, + "compression/movement_sparsity/importance_threshold": -0.00019652914252192826, + "compression/movement_sparsity/linear_layer_sparsity": 0.9037222553237115, + "compression/movement_sparsity/model_sparsity": 0.8726766156784982, + "compression_loss": 153.90185546875, + "distillation_loss": 7.715006351470947, + "epoch": 1.65, + "learning_rate": 3.820794590025359e-05, + "loss": 160.3011, + "step": 1953, + "task_loss": 3.968613386154175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.435873553404259, + "compression/movement_sparsity/importance_threshold": -0.00019511187053135575, + "compression/movement_sparsity/linear_layer_sparsity": 0.9040080180011051, + "compression/movement_sparsity/model_sparsity": 0.8729525615288144, + "compression_loss": 153.9506072998047, + "distillation_loss": 6.49777889251709, + "epoch": 1.65, + "learning_rate": 3.820190798212777e-05, + "loss": 160.8285, + "step": 1954, + "task_loss": 2.9812023639678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.436337116277529, + "compression/movement_sparsity/importance_threshold": -0.00019370142875399076, + "compression/movement_sparsity/linear_layer_sparsity": 0.9043232214483914, + "compression/movement_sparsity/model_sparsity": 0.8732569367680073, + "compression_loss": 153.9990234375, + "distillation_loss": 6.795435905456543, + "epoch": 1.65, + "learning_rate": 3.8195870064001934e-05, + "loss": 160.6026, + "step": 1955, + "task_loss": 3.6901659965515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4367984397209939, + "compression/movement_sparsity/importance_threshold": -0.000192297800691737, + "compression/movement_sparsity/linear_layer_sparsity": 0.9045080818192502, + "compression/movement_sparsity/model_sparsity": 0.87343544661643, + "compression_loss": 154.04721069335938, + "distillation_loss": 9.469181060791016, + "epoch": 1.65, + "learning_rate": 3.81898321458761e-05, + "loss": 160.9894, + "step": 1956, + "task_loss": 4.162032604217529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4372575291570024, + "compression/movement_sparsity/importance_threshold": -0.00019090096984649297, + "compression/movement_sparsity/linear_layer_sparsity": 0.9046436357569347, + "compression/movement_sparsity/model_sparsity": 0.8735663438593424, + "compression_loss": 154.0951385498047, + "distillation_loss": 5.460579872131348, + "epoch": 1.65, + "learning_rate": 3.8183794227750275e-05, + "loss": 159.9049, + "step": 1957, + "task_loss": 2.5839810371398926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.437714390007903, + "compression/movement_sparsity/importance_threshold": -0.0001895109197201589, + "compression/movement_sparsity/linear_layer_sparsity": 0.9049524120778654, + "compression/movement_sparsity/model_sparsity": 0.873864512763742, + "compression_loss": 154.1427459716797, + "distillation_loss": 6.584769248962402, + "epoch": 1.65, + "learning_rate": 3.817775630962444e-05, + "loss": 160.8876, + "step": 1958, + "task_loss": 3.030895233154297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4381690276960435, + "compression/movement_sparsity/importance_threshold": -0.00018812763381463593, + "compression/movement_sparsity/linear_layer_sparsity": 0.9052091751795684, + "compression/movement_sparsity/model_sparsity": 0.8741124552630058, + "compression_loss": 154.19012451171875, + "distillation_loss": 7.80364465713501, + "epoch": 1.66, + "learning_rate": 3.817171839149861e-05, + "loss": 161.1926, + "step": 1959, + "task_loss": 3.5214920043945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4386214476437722, + "compression/movement_sparsity/importance_threshold": -0.00018675109563182513, + "compression/movement_sparsity/linear_layer_sparsity": 0.9055465933511604, + "compression/movement_sparsity/model_sparsity": 0.8744382820823839, + "compression_loss": 154.2372589111328, + "distillation_loss": 9.102875709533691, + "epoch": 1.66, + "learning_rate": 3.8165680473372784e-05, + "loss": 161.7429, + "step": 1960, + "task_loss": 4.296591758728027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4390716552734375, + "compression/movement_sparsity/importance_threshold": -0.00018538128867362502, + "compression/movement_sparsity/linear_layer_sparsity": 0.905795510350559, + "compression/movement_sparsity/model_sparsity": 0.874678648017095, + "compression_loss": 154.28424072265625, + "distillation_loss": 6.282147407531738, + "epoch": 1.66, + "learning_rate": 3.815964255524696e-05, + "loss": 161.0876, + "step": 1961, + "task_loss": 2.305163860321045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4395196560073873, + "compression/movement_sparsity/importance_threshold": -0.00018401819644193757, + "compression/movement_sparsity/linear_layer_sparsity": 0.9059984239112185, + "compression/movement_sparsity/model_sparsity": 0.8748745908727107, + "compression_loss": 154.3309783935547, + "distillation_loss": 6.44364595413208, + "epoch": 1.66, + "learning_rate": 3.815360463712112e-05, + "loss": 160.4177, + "step": 1962, + "task_loss": 2.2141098976135254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.43996545526797, + "compression/movement_sparsity/importance_threshold": -0.00018266180243866302, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061760820848253, + "compression/movement_sparsity/model_sparsity": 0.8750461459415136, + "compression_loss": 154.37742614746094, + "distillation_loss": 5.939295768737793, + "epoch": 1.66, + "learning_rate": 3.814756671899529e-05, + "loss": 160.5975, + "step": 1963, + "task_loss": 2.339681625366211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4404090584775335, + "compression/movement_sparsity/importance_threshold": -0.0001813120901657016, + "compression/movement_sparsity/linear_layer_sparsity": 0.9064238662882985, + "compression/movement_sparsity/model_sparsity": 0.8752854179953241, + "compression_loss": 154.42367553710938, + "distillation_loss": 7.142485618591309, + "epoch": 1.66, + "learning_rate": 3.8141528800869466e-05, + "loss": 160.7727, + "step": 1964, + "task_loss": 3.8312253952026367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4408504710584262, + "compression/movement_sparsity/importance_threshold": -0.0001799690431249553, + "compression/movement_sparsity/linear_layer_sparsity": 0.9066003916659798, + "compression/movement_sparsity/model_sparsity": 0.8754558791832265, + "compression_loss": 154.4696502685547, + "distillation_loss": 7.039237976074219, + "epoch": 1.66, + "learning_rate": 3.813549088274363e-05, + "loss": 161.4191, + "step": 1965, + "task_loss": 3.5504560470581055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4412896984329964, + "compression/movement_sparsity/importance_threshold": -0.00017863264481832088, + "compression/movement_sparsity/linear_layer_sparsity": 0.9068281790403278, + "compression/movement_sparsity/model_sparsity": 0.8756758413605097, + "compression_loss": 154.51553344726562, + "distillation_loss": 6.728089332580566, + "epoch": 1.66, + "learning_rate": 3.81294529646178e-05, + "loss": 161.4745, + "step": 1966, + "task_loss": 3.701183795928955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4417267460235919, + "compression/movement_sparsity/importance_threshold": -0.00017730287874770292, + "compression/movement_sparsity/linear_layer_sparsity": 0.9069930902787318, + "compression/movement_sparsity/model_sparsity": 0.8758350873905482, + "compression_loss": 154.56114196777344, + "distillation_loss": 7.8387908935546875, + "epoch": 1.66, + "learning_rate": 3.8123415046491974e-05, + "loss": 160.938, + "step": 1967, + "task_loss": 4.294440269470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4421616192525615, + "compression/movement_sparsity/importance_threshold": -0.0001759797284149999, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072113740914738, + "compression/movement_sparsity/model_sparsity": 0.876045872482803, + "compression_loss": 154.60650634765625, + "distillation_loss": 5.65949010848999, + "epoch": 1.66, + "learning_rate": 3.811737712836614e-05, + "loss": 161.1363, + "step": 1968, + "task_loss": 2.6471495628356934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.442594323542253, + "compression/movement_sparsity/importance_threshold": -0.00017466317732211124, + "compression/movement_sparsity/linear_layer_sparsity": 0.9073294114269013, + "compression/movement_sparsity/model_sparsity": 0.8761598548726331, + "compression_loss": 154.65162658691406, + "distillation_loss": 6.765289306640625, + "epoch": 1.66, + "learning_rate": 3.811133921024031e-05, + "loss": 160.5359, + "step": 1969, + "task_loss": 3.3650200366973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4430248643150148, + "compression/movement_sparsity/importance_threshold": -0.00017335320897093887, + "compression/movement_sparsity/linear_layer_sparsity": 0.9074776049822797, + "compression/movement_sparsity/model_sparsity": 0.8763029575234877, + "compression_loss": 154.6966094970703, + "distillation_loss": 7.206822395324707, + "epoch": 1.66, + "learning_rate": 3.810530129211448e-05, + "loss": 160.9106, + "step": 1970, + "task_loss": 3.7178566455841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4434532469931947, + "compression/movement_sparsity/importance_threshold": -0.0001720498068633839, + "compression/movement_sparsity/linear_layer_sparsity": 0.9076354094167728, + "compression/movement_sparsity/model_sparsity": 0.8764553408901927, + "compression_loss": 154.7411651611328, + "distillation_loss": 6.287899494171143, + "epoch": 1.67, + "learning_rate": 3.809926337398865e-05, + "loss": 161.1329, + "step": 1971, + "task_loss": 3.068429470062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4438794769991412, + "compression/movement_sparsity/importance_threshold": -0.00017075295450134486, + "compression/movement_sparsity/linear_layer_sparsity": 0.9077381122726205, + "compression/movement_sparsity/model_sparsity": 0.876554515586991, + "compression_loss": 154.78558349609375, + "distillation_loss": 6.153408050537109, + "epoch": 1.67, + "learning_rate": 3.809322545586282e-05, + "loss": 161.2242, + "step": 1972, + "task_loss": 2.855734348297119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4443035597552023, + "compression/movement_sparsity/importance_threshold": -0.00016946263538672457, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078229765736848, + "compression/movement_sparsity/model_sparsity": 0.8766364645382408, + "compression_loss": 154.82969665527344, + "distillation_loss": 6.8665056228637695, + "epoch": 1.67, + "learning_rate": 3.808718753773699e-05, + "loss": 161.7839, + "step": 1973, + "task_loss": 4.93058967590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4447255006837267, + "compression/movement_sparsity/importance_threshold": -0.00016817883302142154, + "compression/movement_sparsity/linear_layer_sparsity": 0.9080460062051461, + "compression/movement_sparsity/model_sparsity": 0.8768518324157419, + "compression_loss": 154.87353515625, + "distillation_loss": 7.2077226638793945, + "epoch": 1.67, + "learning_rate": 3.8081149619611165e-05, + "loss": 161.2269, + "step": 1974, + "task_loss": 3.149751901626587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4451453052070622, + "compression/movement_sparsity/importance_threshold": -0.000166901530907336, + "compression/movement_sparsity/linear_layer_sparsity": 0.9082528070444548, + "compression/movement_sparsity/model_sparsity": 0.8770515290100266, + "compression_loss": 154.9173583984375, + "distillation_loss": 6.691270351409912, + "epoch": 1.67, + "learning_rate": 3.8075111701485325e-05, + "loss": 160.9845, + "step": 1975, + "task_loss": 3.401458501815796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.445562978747557, + "compression/movement_sparsity/importance_threshold": -0.00016563071254636994, + "compression/movement_sparsity/linear_layer_sparsity": 0.9083947761843274, + "compression/movement_sparsity/model_sparsity": 0.8771886210731964, + "compression_loss": 154.96072387695312, + "distillation_loss": 8.148401260375977, + "epoch": 1.67, + "learning_rate": 3.80690737833595e-05, + "loss": 161.3526, + "step": 1976, + "task_loss": 4.160513877868652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.445978526727559, + "compression/movement_sparsity/importance_threshold": -0.0001643663614404236, + "compression/movement_sparsity/linear_layer_sparsity": 0.9086244475771618, + "compression/movement_sparsity/model_sparsity": 0.8774104025471351, + "compression_loss": 155.00384521484375, + "distillation_loss": 7.9408063888549805, + "epoch": 1.67, + "learning_rate": 3.806303586523367e-05, + "loss": 161.2826, + "step": 1977, + "task_loss": 3.8156015872955322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.446391954569417, + "compression/movement_sparsity/importance_threshold": -0.00016310846109139632, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089127977989322, + "compression/movement_sparsity/model_sparsity": 0.8776888470517187, + "compression_loss": 155.04678344726562, + "distillation_loss": 6.890384197235107, + "epoch": 1.67, + "learning_rate": 3.8056997947107834e-05, + "loss": 162.2542, + "step": 1978, + "task_loss": 3.1062464714050293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4468032676954792, + "compression/movement_sparsity/importance_threshold": -0.00016185699500118923, + "compression/movement_sparsity/linear_layer_sparsity": 0.9091257515087411, + "compression/movement_sparsity/model_sparsity": 0.8778944851464734, + "compression_loss": 155.0895233154297, + "distillation_loss": 6.170140266418457, + "epoch": 1.67, + "learning_rate": 3.805096002898201e-05, + "loss": 160.7493, + "step": 1979, + "task_loss": 3.550039291381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.447212471528093, + "compression/movement_sparsity/importance_threshold": -0.00016061194667170257, + "compression/movement_sparsity/linear_layer_sparsity": 0.9093343528973629, + "compression/movement_sparsity/model_sparsity": 0.8780959204356631, + "compression_loss": 155.13204956054688, + "distillation_loss": 6.297365665435791, + "epoch": 1.67, + "learning_rate": 3.804492211085618e-05, + "loss": 161.6006, + "step": 1980, + "task_loss": 2.8500585556030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4476195714896074, + "compression/movement_sparsity/importance_threshold": -0.00015937329960483745, + "compression/movement_sparsity/linear_layer_sparsity": 0.9094703838017528, + "compression/movement_sparsity/model_sparsity": 0.8782272782600071, + "compression_loss": 155.1742401123047, + "distillation_loss": 6.63515043258667, + "epoch": 1.67, + "learning_rate": 3.803888419273035e-05, + "loss": 161.8307, + "step": 1981, + "task_loss": 3.8548760414123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4480245730023702, + "compression/movement_sparsity/importance_threshold": -0.0001581410373024941, + "compression/movement_sparsity/linear_layer_sparsity": 0.9097322265988688, + "compression/movement_sparsity/model_sparsity": 0.8784801259515195, + "compression_loss": 155.21630859375, + "distillation_loss": 7.911506652832031, + "epoch": 1.67, + "learning_rate": 3.8032846274604516e-05, + "loss": 161.9871, + "step": 1982, + "task_loss": 3.444103717803955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.44842748148873, + "compression/movement_sparsity/importance_threshold": -0.00015691514326657188, + "compression/movement_sparsity/linear_layer_sparsity": 0.9098883258773898, + "compression/movement_sparsity/model_sparsity": 0.8786308627396058, + "compression_loss": 155.2582244873047, + "distillation_loss": 7.340427875518799, + "epoch": 1.68, + "learning_rate": 3.802680835647869e-05, + "loss": 161.4076, + "step": 1983, + "task_loss": 2.403489828109741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4488283023710342, + "compression/movement_sparsity/importance_threshold": -0.00015569560099897278, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100581498728597, + "compression/movement_sparsity/model_sparsity": 0.8787948527583916, + "compression_loss": 155.29989624023438, + "distillation_loss": 5.912631034851074, + "epoch": 1.68, + "learning_rate": 3.802077043835286e-05, + "loss": 161.1805, + "step": 1984, + "task_loss": 3.3173036575317383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.449227041071632, + "compression/movement_sparsity/importance_threshold": -0.00015448239400159703, + "compression/movement_sparsity/linear_layer_sparsity": 0.9102638298404108, + "compression/movement_sparsity/model_sparsity": 0.8789934669863118, + "compression_loss": 155.3412628173828, + "distillation_loss": 7.287107467651367, + "epoch": 1.68, + "learning_rate": 3.8014732520227024e-05, + "loss": 161.688, + "step": 1985, + "task_loss": 4.141517162322998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4496237030128707, + "compression/movement_sparsity/importance_threshold": -0.000153275505776344, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104080049512959, + "compression/movement_sparsity/model_sparsity": 0.8791326892386035, + "compression_loss": 155.38246154785156, + "distillation_loss": 5.387873649597168, + "epoch": 1.68, + "learning_rate": 3.80086946021012e-05, + "loss": 161.1354, + "step": 1986, + "task_loss": 2.9147231578826904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.450018293617099, + "compression/movement_sparsity/importance_threshold": -0.00015207491982511482, + "compression/movement_sparsity/linear_layer_sparsity": 0.9106325966487174, + "compression/movement_sparsity/model_sparsity": 0.8793495655202936, + "compression_loss": 155.42344665527344, + "distillation_loss": 5.369961261749268, + "epoch": 1.68, + "learning_rate": 3.8002656683975365e-05, + "loss": 162.2061, + "step": 1987, + "task_loss": 3.8838868141174316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4504108183066649, + "compression/movement_sparsity/importance_threshold": -0.0001508806196498097, + "compression/movement_sparsity/linear_layer_sparsity": 0.910857546071168, + "compression/movement_sparsity/model_sparsity": 0.8795667872380577, + "compression_loss": 155.46417236328125, + "distillation_loss": 5.209074974060059, + "epoch": 1.68, + "learning_rate": 3.799661876584953e-05, + "loss": 161.5251, + "step": 1988, + "task_loss": 2.7505199909210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.450801282503917, + "compression/movement_sparsity/importance_threshold": -0.00014969258875232976, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110679122365999, + "compression/movement_sparsity/model_sparsity": 0.879769926678545, + "compression_loss": 155.50448608398438, + "distillation_loss": 8.447084426879883, + "epoch": 1.68, + "learning_rate": 3.7990580847723706e-05, + "loss": 161.8272, + "step": 1989, + "task_loss": 4.0983171463012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4511896916312026, + "compression/movement_sparsity/importance_threshold": -0.0001485108106345761, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113143967058008, + "compression/movement_sparsity/model_sparsity": 0.880007943647954, + "compression_loss": 155.5447998046875, + "distillation_loss": 6.900386810302734, + "epoch": 1.68, + "learning_rate": 3.798454292959788e-05, + "loss": 162.2076, + "step": 1990, + "task_loss": 4.0427422523498535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4515760511108708, + "compression/movement_sparsity/importance_threshold": -0.00014733526879844722, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114136892497047, + "compression/movement_sparsity/model_sparsity": 0.8801038251875151, + "compression_loss": 155.58474731445312, + "distillation_loss": 7.763065338134766, + "epoch": 1.68, + "learning_rate": 3.797850501147205e-05, + "loss": 161.4554, + "step": 1991, + "task_loss": 3.791189193725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4519603663652696, + "compression/movement_sparsity/importance_threshold": -0.00014616594674584336, + "compression/movement_sparsity/linear_layer_sparsity": 0.9116688187404416, + "compression/movement_sparsity/model_sparsity": 0.8803501901953751, + "compression_loss": 155.62451171875, + "distillation_loss": 6.4316630363464355, + "epoch": 1.68, + "learning_rate": 3.7972467093346215e-05, + "loss": 161.9414, + "step": 1992, + "task_loss": 2.49525785446167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4523426428167467, + "compression/movement_sparsity/importance_threshold": -0.00014500282797866824, + "compression/movement_sparsity/linear_layer_sparsity": 0.9119114159309932, + "compression/movement_sparsity/model_sparsity": 0.880584453426115, + "compression_loss": 155.66416931152344, + "distillation_loss": 5.468949317932129, + "epoch": 1.68, + "learning_rate": 3.796642917522039e-05, + "loss": 162.2815, + "step": 1993, + "task_loss": 2.1942763328552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.452722885887651, + "compression/movement_sparsity/importance_threshold": -0.00014384589599881863, + "compression/movement_sparsity/linear_layer_sparsity": 0.912071354791493, + "compression/movement_sparsity/model_sparsity": 0.8807388978947271, + "compression_loss": 155.7034912109375, + "distillation_loss": 6.161131381988525, + "epoch": 1.69, + "learning_rate": 3.7960391257094556e-05, + "loss": 161.9526, + "step": 1994, + "task_loss": 2.6343321800231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4531011010003303, + "compression/movement_sparsity/importance_threshold": -0.00014269513430819735, + "compression/movement_sparsity/linear_layer_sparsity": 0.912288267324957, + "compression/movement_sparsity/model_sparsity": 0.8809483588153657, + "compression_loss": 155.74256896972656, + "distillation_loss": 5.467075347900391, + "epoch": 1.69, + "learning_rate": 3.795435333896872e-05, + "loss": 161.9772, + "step": 1995, + "task_loss": 2.2041709423065186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4534772935771327, + "compression/movement_sparsity/importance_threshold": -0.0001415505264087038, + "compression/movement_sparsity/linear_layer_sparsity": 0.9124986692628918, + "compression/movement_sparsity/model_sparsity": 0.8811515327994603, + "compression_loss": 155.781494140625, + "distillation_loss": 6.376107215881348, + "epoch": 1.69, + "learning_rate": 3.79483154208429e-05, + "loss": 161.4582, + "step": 1996, + "task_loss": 3.3445024490356445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4538514690404065, + "compression/movement_sparsity/importance_threshold": -0.00014041205580223818, + "compression/movement_sparsity/linear_layer_sparsity": 0.9127566247813584, + "compression/movement_sparsity/model_sparsity": 0.8814006267523037, + "compression_loss": 155.82015991210938, + "distillation_loss": 5.690335273742676, + "epoch": 1.69, + "learning_rate": 3.7942277502717064e-05, + "loss": 161.511, + "step": 1997, + "task_loss": 2.5769665241241455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4542236328125, + "compression/movement_sparsity/importance_threshold": -0.0001392797059907025, + "compression/movement_sparsity/linear_layer_sparsity": 0.9129653573358243, + "compression/movement_sparsity/model_sparsity": 0.8816021887013871, + "compression_loss": 155.858642578125, + "distillation_loss": 5.2250237464904785, + "epoch": 1.69, + "learning_rate": 3.793623958459123e-05, + "loss": 162.0871, + "step": 1998, + "task_loss": 2.960421562194824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4545937903157613, + "compression/movement_sparsity/importance_threshold": -0.00013815346047599524, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131690936640506, + "compression/movement_sparsity/model_sparsity": 0.8817989260599726, + "compression_loss": 155.8968048095703, + "distillation_loss": 6.497771263122559, + "epoch": 1.69, + "learning_rate": 3.7930201666465405e-05, + "loss": 162.3331, + "step": 1999, + "task_loss": 3.3142573833465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4549619469725386, + "compression/movement_sparsity/importance_threshold": -0.00013703330276001926, + "compression/movement_sparsity/linear_layer_sparsity": 0.9133682868844077, + "compression/movement_sparsity/model_sparsity": 0.8819912763804204, + "compression_loss": 155.93484497070312, + "distillation_loss": 6.367579460144043, + "epoch": 1.69, + "learning_rate": 3.792416374833957e-05, + "loss": 162.4195, + "step": 2000, + "task_loss": 2.366534948348999 + }, + { + "epoch": 1.69, + "eval_accuracy": 0.3372673267326733, + "eval_loss": 162.77630615234375, + "eval_runtime": 310.0412, + "eval_samples_per_second": 81.441, + "eval_steps_per_second": 0.639, + "step": 2000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.45532810820518, + "compression/movement_sparsity/importance_threshold": -0.00013591921634467218, + "compression/movement_sparsity/linear_layer_sparsity": 0.913518257140764, + "compression/movement_sparsity/model_sparsity": 0.8821360946971083, + "compression_loss": 155.97264099121094, + "distillation_loss": 7.12130069732666, + "epoch": 1.69, + "learning_rate": 3.7918125830213746e-05, + "loss": 163.1065, + "step": 2001, + "task_loss": 3.3076558113098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.455692279436034, + "compression/movement_sparsity/importance_threshold": -0.00013481118473185685, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137163771860338, + "compression/movement_sparsity/model_sparsity": 0.8823274087093347, + "compression_loss": 156.01026916503906, + "distillation_loss": 7.194187641143799, + "epoch": 1.69, + "learning_rate": 3.7912087912087914e-05, + "loss": 162.1262, + "step": 2002, + "task_loss": 3.8227150440216064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4560544660874486, + "compression/movement_sparsity/importance_threshold": -0.00013370919142347264, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139006175001755, + "compression/movement_sparsity/model_sparsity": 0.8825053198018961, + "compression_loss": 156.047607421875, + "distillation_loss": 6.145810604095459, + "epoch": 1.69, + "learning_rate": 3.790604999396208e-05, + "loss": 162.3746, + "step": 2003, + "task_loss": 3.821897506713867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4564146735817722, + "compression/movement_sparsity/importance_threshold": -0.00013261321992141892, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140687959605117, + "compression/movement_sparsity/model_sparsity": 0.8826677208147423, + "compression_loss": 156.08482360839844, + "distillation_loss": 5.829546928405762, + "epoch": 1.69, + "learning_rate": 3.7900012075836255e-05, + "loss": 162.0737, + "step": 2004, + "task_loss": 3.414478063583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4567729073413525, + "compression/movement_sparsity/importance_threshold": -0.00013152325372759854, + "compression/movement_sparsity/linear_layer_sparsity": 0.9143393553241694, + "compression/movement_sparsity/model_sparsity": 0.8829289856319205, + "compression_loss": 156.12173461914062, + "distillation_loss": 8.78770637512207, + "epoch": 1.69, + "learning_rate": 3.789397415771042e-05, + "loss": 163.4461, + "step": 2005, + "task_loss": 4.179129123687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4571291727885378, + "compression/movement_sparsity/importance_threshold": -0.00013043927634391085, + "compression/movement_sparsity/linear_layer_sparsity": 0.9144861656761022, + "compression/movement_sparsity/model_sparsity": 0.8830707525966228, + "compression_loss": 156.158447265625, + "distillation_loss": 8.001663208007812, + "epoch": 1.7, + "learning_rate": 3.7887936239584596e-05, + "loss": 163.169, + "step": 2006, + "task_loss": 4.638307094573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.457483475345677, + "compression/movement_sparsity/importance_threshold": -0.0001293612712722561, + "compression/movement_sparsity/linear_layer_sparsity": 0.9146587322301284, + "compression/movement_sparsity/model_sparsity": 0.8832373909586414, + "compression_loss": 156.19508361816406, + "distillation_loss": 8.28236198425293, + "epoch": 1.7, + "learning_rate": 3.788189832145876e-05, + "loss": 162.8239, + "step": 2007, + "task_loss": 3.3667547702789307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4578358204351174, + "compression/movement_sparsity/importance_threshold": -0.00012828922201453454, + "compression/movement_sparsity/linear_layer_sparsity": 0.9149671746743653, + "compression/movement_sparsity/model_sparsity": 0.8835352374560388, + "compression_loss": 156.2313232421875, + "distillation_loss": 6.896763801574707, + "epoch": 1.7, + "learning_rate": 3.787586040333293e-05, + "loss": 162.4879, + "step": 2008, + "task_loss": 4.932947635650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4581862134792076, + "compression/movement_sparsity/importance_threshold": -0.00012722311207264726, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151260522839455, + "compression/movement_sparsity/model_sparsity": 0.8836886571309652, + "compression_loss": 156.26756286621094, + "distillation_loss": 5.693926811218262, + "epoch": 1.7, + "learning_rate": 3.7869822485207104e-05, + "loss": 162.8252, + "step": 2009, + "task_loss": 4.165630340576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.458534659900296, + "compression/movement_sparsity/importance_threshold": -0.0001261629249484945, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153460531768272, + "compression/movement_sparsity/model_sparsity": 0.8839011003163746, + "compression_loss": 156.30340576171875, + "distillation_loss": 5.526670932769775, + "epoch": 1.7, + "learning_rate": 3.786378456708127e-05, + "loss": 162.3079, + "step": 2010, + "task_loss": 3.1792430877685547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4588811651207303, + "compression/movement_sparsity/importance_threshold": -0.00012510864414397652, + "compression/movement_sparsity/linear_layer_sparsity": 0.9154999941810062, + "compression/movement_sparsity/model_sparsity": 0.8840497529734821, + "compression_loss": 156.33914184570312, + "distillation_loss": 6.163922309875488, + "epoch": 1.7, + "learning_rate": 3.7857746648955445e-05, + "loss": 162.1426, + "step": 2011, + "task_loss": 2.2184555530548096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.459225734562859, + "compression/movement_sparsity/importance_threshold": -0.00012406025316099353, + "compression/movement_sparsity/linear_layer_sparsity": 0.9157316330615005, + "compression/movement_sparsity/model_sparsity": 0.8842734343458268, + "compression_loss": 156.37466430664062, + "distillation_loss": 6.6913604736328125, + "epoch": 1.7, + "learning_rate": 3.785170873082961e-05, + "loss": 163.211, + "step": 2012, + "task_loss": 2.4694747924804688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4595683736490304, + "compression/movement_sparsity/importance_threshold": -0.00012301773550144664, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159080272733378, + "compression/movement_sparsity/model_sparsity": 0.8844437688738355, + "compression_loss": 156.40988159179688, + "distillation_loss": 7.296276092529297, + "epoch": 1.7, + "learning_rate": 3.784567081270378e-05, + "loss": 162.1593, + "step": 2013, + "task_loss": 2.879027843475342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4599090878015926, + "compression/movement_sparsity/importance_threshold": -0.00012198107466723611, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159969696397336, + "compression/movement_sparsity/model_sparsity": 0.8845296557963269, + "compression_loss": 156.44503784179688, + "distillation_loss": 6.6902947425842285, + "epoch": 1.7, + "learning_rate": 3.7839632894577954e-05, + "loss": 162.1855, + "step": 2014, + "task_loss": 2.551546335220337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4602478824428933, + "compression/movement_sparsity/importance_threshold": -0.00012095025416026303, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162229803131038, + "compression/movement_sparsity/model_sparsity": 0.8847479023077767, + "compression_loss": 156.4798583984375, + "distillation_loss": 6.725427627563477, + "epoch": 1.7, + "learning_rate": 3.783359497645212e-05, + "loss": 162.7022, + "step": 2015, + "task_loss": 2.828561544418335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4605847629952815, + "compression/movement_sparsity/importance_threshold": -0.00011992525748242677, + "compression/movement_sparsity/linear_layer_sparsity": 0.9163561136447582, + "compression/movement_sparsity/model_sparsity": 0.8848764620999227, + "compression_loss": 156.5145721435547, + "distillation_loss": 7.439986228942871, + "epoch": 1.7, + "learning_rate": 3.782755705832629e-05, + "loss": 163.9552, + "step": 2016, + "task_loss": 4.732082843780518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4609197348811052, + "compression/movement_sparsity/importance_threshold": -0.00011890606813562757, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165757091159403, + "compression/movement_sparsity/model_sparsity": 0.885088513791115, + "compression_loss": 156.54904174804688, + "distillation_loss": 6.835080623626709, + "epoch": 1.7, + "learning_rate": 3.782151914020046e-05, + "loss": 163.0522, + "step": 2017, + "task_loss": 4.46486234664917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4612528035227121, + "compression/movement_sparsity/importance_threshold": -0.00011789266962176741, + "compression/movement_sparsity/linear_layer_sparsity": 0.9167404772643326, + "compression/movement_sparsity/model_sparsity": 0.8852476216467239, + "compression_loss": 156.58331298828125, + "distillation_loss": 7.157663822174072, + "epoch": 1.71, + "learning_rate": 3.781548122207463e-05, + "loss": 163.2967, + "step": 2018, + "task_loss": 3.536332130432129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4615839743424508, + "compression/movement_sparsity/importance_threshold": -0.00011688504544274479, + "compression/movement_sparsity/linear_layer_sparsity": 0.9169146297326545, + "compression/movement_sparsity/model_sparsity": 0.8854157914420032, + "compression_loss": 156.61752319335938, + "distillation_loss": 7.78938627243042, + "epoch": 1.71, + "learning_rate": 3.7809443303948796e-05, + "loss": 163.1402, + "step": 2019, + "task_loss": 4.093336582183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4619132527626695, + "compression/movement_sparsity/importance_threshold": -0.00011588317910046254, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170125748456154, + "compression/movement_sparsity/model_sparsity": 0.8855103718390196, + "compression_loss": 156.65145874023438, + "distillation_loss": 7.758843421936035, + "epoch": 1.71, + "learning_rate": 3.780340538582297e-05, + "loss": 163.636, + "step": 2020, + "task_loss": 3.7144312858581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4622406442057163, + "compression/movement_sparsity/importance_threshold": -0.00011488705409681919, + "compression/movement_sparsity/linear_layer_sparsity": 0.917101755695364, + "compression/movement_sparsity/model_sparsity": 0.8855964890522269, + "compression_loss": 156.68531799316406, + "distillation_loss": 6.799404144287109, + "epoch": 1.71, + "learning_rate": 3.7797367467697144e-05, + "loss": 163.1987, + "step": 2021, + "task_loss": 3.8739829063415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4625661540939394, + "compression/movement_sparsity/importance_threshold": -0.00011389665393371582, + "compression/movement_sparsity/linear_layer_sparsity": 0.9172955949644527, + "compression/movement_sparsity/model_sparsity": 0.885783669346103, + "compression_loss": 156.71885681152344, + "distillation_loss": 7.377967834472656, + "epoch": 1.71, + "learning_rate": 3.779132954957131e-05, + "loss": 163.2695, + "step": 2022, + "task_loss": 4.174900054931641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.462889787849687, + "compression/movement_sparsity/importance_threshold": -0.00011291196211305268, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174443966523805, + "compression/movement_sparsity/model_sparsity": 0.885927359238283, + "compression_loss": 156.75222778320312, + "distillation_loss": 6.577198028564453, + "epoch": 1.71, + "learning_rate": 3.778529163144548e-05, + "loss": 162.9832, + "step": 2023, + "task_loss": 3.614705801010132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4632115508953074, + "compression/movement_sparsity/importance_threshold": -0.00011193296213673088, + "compression/movement_sparsity/linear_layer_sparsity": 0.9175784839174458, + "compression/movement_sparsity/model_sparsity": 0.8860568401932927, + "compression_loss": 156.78543090820312, + "distillation_loss": 8.962401390075684, + "epoch": 1.71, + "learning_rate": 3.777925371331965e-05, + "loss": 163.4442, + "step": 2024, + "task_loss": 4.668178558349609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4635314486531485, + "compression/movement_sparsity/importance_threshold": -0.00011095963750664978, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177568933136137, + "compression/movement_sparsity/model_sparsity": 0.8862291206778505, + "compression_loss": 156.81838989257812, + "distillation_loss": 7.469170093536377, + "epoch": 1.71, + "learning_rate": 3.777321579519382e-05, + "loss": 163.1041, + "step": 2025, + "task_loss": 3.1402318477630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4638494865455587, + "compression/movement_sparsity/importance_threshold": -0.00010999197172471137, + "compression/movement_sparsity/linear_layer_sparsity": 0.9179160690273848, + "compression/movement_sparsity/model_sparsity": 0.8863828282161719, + "compression_loss": 156.85110473632812, + "distillation_loss": 5.778111934661865, + "epoch": 1.71, + "learning_rate": 3.776717787706799e-05, + "loss": 162.3084, + "step": 2026, + "task_loss": 3.376955986022949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4641656699948862, + "compression/movement_sparsity/importance_threshold": -0.000109029948292815, + "compression/movement_sparsity/linear_layer_sparsity": 0.9180500370507737, + "compression/movement_sparsity/model_sparsity": 0.8865121940258236, + "compression_loss": 156.8836669921875, + "distillation_loss": 7.7392578125, + "epoch": 1.71, + "learning_rate": 3.776113995894216e-05, + "loss": 163.0562, + "step": 2027, + "task_loss": 4.390015125274658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4644800044234794, + "compression/movement_sparsity/importance_threshold": -0.0001080735507128618, + "compression/movement_sparsity/linear_layer_sparsity": 0.9182072452768849, + "compression/movement_sparsity/model_sparsity": 0.8866640016657388, + "compression_loss": 156.9159393310547, + "distillation_loss": 8.972311973571777, + "epoch": 1.71, + "learning_rate": 3.775510204081633e-05, + "loss": 163.8257, + "step": 2028, + "task_loss": 3.9244768619537354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4647924952536862, + "compression/movement_sparsity/importance_threshold": -0.00010712276248675111, + "compression/movement_sparsity/linear_layer_sparsity": 0.9183535548137769, + "compression/movement_sparsity/model_sparsity": 0.8868052850199378, + "compression_loss": 156.9480743408203, + "distillation_loss": 6.731687545776367, + "epoch": 1.71, + "learning_rate": 3.7749064122690495e-05, + "loss": 163.0141, + "step": 2029, + "task_loss": 3.1125693321228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4651031479078545, + "compression/movement_sparsity/importance_threshold": -0.00010617756711638494, + "compression/movement_sparsity/linear_layer_sparsity": 0.9184603238107885, + "compression/movement_sparsity/model_sparsity": 0.8869083861734421, + "compression_loss": 156.97998046875, + "distillation_loss": 7.222890853881836, + "epoch": 1.72, + "learning_rate": 3.774302620456467e-05, + "loss": 163.2517, + "step": 2030, + "task_loss": 3.2845473289489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4654119678083333, + "compression/movement_sparsity/importance_threshold": -0.00010523794810366176, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186254616083775, + "compression/movement_sparsity/model_sparsity": 0.8870678509796608, + "compression_loss": 157.01168823242188, + "distillation_loss": 6.474638938903809, + "epoch": 1.72, + "learning_rate": 3.773698828643884e-05, + "loss": 163.4867, + "step": 2031, + "task_loss": 3.074014663696289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4657189603774703, + "compression/movement_sparsity/importance_threshold": -0.00010430388895048357, + "compression/movement_sparsity/linear_layer_sparsity": 0.918727973677543, + "compression/movement_sparsity/model_sparsity": 0.8871668414438864, + "compression_loss": 157.04327392578125, + "distillation_loss": 6.977015495300293, + "epoch": 1.72, + "learning_rate": 3.7730950368313004e-05, + "loss": 163.6655, + "step": 2032, + "task_loss": 3.273977756500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4660241310376136, + "compression/movement_sparsity/importance_threshold": -0.0001033753731587506, + "compression/movement_sparsity/linear_layer_sparsity": 0.91881262334359, + "compression/movement_sparsity/model_sparsity": 0.8872485831334919, + "compression_loss": 157.07443237304688, + "distillation_loss": 8.870085716247559, + "epoch": 1.72, + "learning_rate": 3.772491245018718e-05, + "loss": 163.3332, + "step": 2033, + "task_loss": 4.21150016784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4663274852111117, + "compression/movement_sparsity/importance_threshold": -0.00010245238423036394, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189579431745682, + "compression/movement_sparsity/model_sparsity": 0.88738891078122, + "compression_loss": 157.1055450439453, + "distillation_loss": 6.959296226501465, + "epoch": 1.72, + "learning_rate": 3.771887453206135e-05, + "loss": 163.528, + "step": 2034, + "task_loss": 3.035745620727539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4666290283203125, + "compression/movement_sparsity/importance_threshold": -0.00010153490566722212, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191849793263551, + "compression/movement_sparsity/model_sparsity": 0.887608147542748, + "compression_loss": 157.13644409179688, + "distillation_loss": 5.344354629516602, + "epoch": 1.72, + "learning_rate": 3.771283661393551e-05, + "loss": 163.2162, + "step": 2035, + "task_loss": 2.107182502746582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4669287657875645, + "compression/movement_sparsity/importance_threshold": -0.0001006229209712271, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193297387214545, + "compression/movement_sparsity/model_sparsity": 0.8877479340072937, + "compression_loss": 157.16722106933594, + "distillation_loss": 6.175367832183838, + "epoch": 1.72, + "learning_rate": 3.7706798695809686e-05, + "loss": 163.6404, + "step": 2036, + "task_loss": 3.0417799949645996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4672267030352155, + "compression/movement_sparsity/importance_threshold": -9.971641364427911e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9194868754025599, + "compression/movement_sparsity/model_sparsity": 0.8878996725599941, + "compression_loss": 157.19764709472656, + "distillation_loss": 7.253015518188477, + "epoch": 1.72, + "learning_rate": 3.770076077768386e-05, + "loss": 164.3176, + "step": 2037, + "task_loss": 4.325547695159912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4675228454856142, + "compression/movement_sparsity/importance_threshold": -9.881536718827841e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9195677451074663, + "compression/movement_sparsity/model_sparsity": 0.8879777641417528, + "compression_loss": 157.2279510498047, + "distillation_loss": 7.390590667724609, + "epoch": 1.72, + "learning_rate": 3.769472285955802e-05, + "loss": 164.2178, + "step": 2038, + "task_loss": 3.8896913528442383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4678171985611084, + "compression/movement_sparsity/importance_threshold": -9.791976510512609e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197364363070109, + "compression/movement_sparsity/model_sparsity": 0.8881406602796381, + "compression_loss": 157.25808715820312, + "distillation_loss": 6.069460868835449, + "epoch": 1.72, + "learning_rate": 3.7688684941432194e-05, + "loss": 163.5156, + "step": 2039, + "task_loss": 2.217817544937134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4681097676840464, + "compression/movement_sparsity/importance_threshold": -9.702959089672065e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199007871095359, + "compression/movement_sparsity/model_sparsity": 0.8882993651264942, + "compression_loss": 157.28807067871094, + "distillation_loss": 6.967180252075195, + "epoch": 1.72, + "learning_rate": 3.768264702330637e-05, + "loss": 164.5535, + "step": 2040, + "task_loss": 3.6561732292175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4684005582767765, + "compression/movement_sparsity/importance_threshold": -9.614482806496582e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200337773511787, + "compression/movement_sparsity/model_sparsity": 0.8884277867442107, + "compression_loss": 157.31784057617188, + "distillation_loss": 6.5490522384643555, + "epoch": 1.72, + "learning_rate": 3.7676609105180535e-05, + "loss": 163.5856, + "step": 2041, + "task_loss": 3.040182113647461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4686895757616467, + "compression/movement_sparsity/importance_threshold": -9.526546011175922e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920130076929006, + "compression/movement_sparsity/model_sparsity": 0.8885207781352875, + "compression_loss": 157.34742736816406, + "distillation_loss": 7.280935764312744, + "epoch": 1.73, + "learning_rate": 3.76705711870547e-05, + "loss": 164.0972, + "step": 2042, + "task_loss": 4.1884846687316895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4689768255610054, + "compression/movement_sparsity/importance_threshold": -9.439147053900195e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202608254271333, + "compression/movement_sparsity/model_sparsity": 0.8886470350202746, + "compression_loss": 157.37689208984375, + "distillation_loss": 8.016792297363281, + "epoch": 1.73, + "learning_rate": 3.7664533268928877e-05, + "loss": 164.4515, + "step": 2043, + "task_loss": 4.070981025695801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4692623130972007, + "compression/movement_sparsity/importance_threshold": -9.3522842848596e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920360117971037, + "compression/movement_sparsity/model_sparsity": 0.8887429165598357, + "compression_loss": 157.4061737060547, + "distillation_loss": 7.001385688781738, + "epoch": 1.73, + "learning_rate": 3.7658495350803044e-05, + "loss": 163.9293, + "step": 2044, + "task_loss": 2.102085828781128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4695460437925807, + "compression/movement_sparsity/importance_threshold": -9.2659560542439e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205359278986602, + "compression/movement_sparsity/model_sparsity": 0.8889126868755904, + "compression_loss": 157.43533325195312, + "distillation_loss": 6.396998405456543, + "epoch": 1.73, + "learning_rate": 3.765245743267721e-05, + "loss": 164.1069, + "step": 2045, + "task_loss": 2.826390504837036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4698280230694938, + "compression/movement_sparsity/importance_threshold": -9.180160712243465e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206228670048933, + "compression/movement_sparsity/model_sparsity": 0.8889966393560684, + "compression_loss": 157.4640655517578, + "distillation_loss": 6.79403018951416, + "epoch": 1.73, + "learning_rate": 3.7646419514551385e-05, + "loss": 164.2619, + "step": 2046, + "task_loss": 2.982492685317993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4701082563502879, + "compression/movement_sparsity/importance_threshold": -9.094896609048147e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207448989364787, + "compression/movement_sparsity/model_sparsity": 0.8891144791153898, + "compression_loss": 157.49285888671875, + "distillation_loss": 6.070186614990234, + "epoch": 1.73, + "learning_rate": 3.764038159642556e-05, + "loss": 163.9935, + "step": 2047, + "task_loss": 2.507523775100708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4703867490573115, + "compression/movement_sparsity/importance_threshold": -9.010162094847968e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209042296644291, + "compression/movement_sparsity/model_sparsity": 0.8892683363426764, + "compression_loss": 157.521484375, + "distillation_loss": 7.141257286071777, + "epoch": 1.73, + "learning_rate": 3.763434367829972e-05, + "loss": 164.0591, + "step": 2048, + "task_loss": 3.3685402870178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4706635066129126, + "compression/movement_sparsity/importance_threshold": -8.92595551983304e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211059269599897, + "compression/movement_sparsity/model_sparsity": 0.8894631047156412, + "compression_loss": 157.54974365234375, + "distillation_loss": 6.28179407119751, + "epoch": 1.73, + "learning_rate": 3.762830576017389e-05, + "loss": 163.2027, + "step": 2049, + "task_loss": 3.0805306434631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4709385344394397, + "compression/movement_sparsity/importance_threshold": -8.842275234193474e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212482776732267, + "compression/movement_sparsity/model_sparsity": 0.8896005652439564, + "compression_loss": 157.57791137695312, + "distillation_loss": 8.601505279541016, + "epoch": 1.73, + "learning_rate": 3.762226784204807e-05, + "loss": 164.8551, + "step": 2050, + "task_loss": 3.607563018798828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4712118379592405, + "compression/movement_sparsity/importance_threshold": -8.759119588119205e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214339488874846, + "compression/movement_sparsity/model_sparsity": 0.8897798580808132, + "compression_loss": 157.60595703125, + "distillation_loss": 6.859668254852295, + "epoch": 1.73, + "learning_rate": 3.7616229923922234e-05, + "loss": 164.269, + "step": 2051, + "task_loss": 2.931633949279785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4714834225946638, + "compression/movement_sparsity/importance_threshold": -8.676486931800259e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215043253248716, + "compression/movement_sparsity/model_sparsity": 0.8898478168710725, + "compression_loss": 157.63377380371094, + "distillation_loss": 6.47484016418457, + "epoch": 1.73, + "learning_rate": 3.76101920057964e-05, + "loss": 163.8806, + "step": 2052, + "task_loss": 2.7673532962799072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.471753293768057, + "compression/movement_sparsity/importance_threshold": -8.594375615426832e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216246401763174, + "compression/movement_sparsity/model_sparsity": 0.8899639985372394, + "compression_loss": 157.66146850585938, + "distillation_loss": 7.326955795288086, + "epoch": 1.73, + "learning_rate": 3.7604154087670576e-05, + "loss": 164.5384, + "step": 2053, + "task_loss": 3.176683187484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.472021456901769, + "compression/movement_sparsity/importance_threshold": -8.512783989188862e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217608380190543, + "compression/movement_sparsity/model_sparsity": 0.8900955175650846, + "compression_loss": 157.6890106201172, + "distillation_loss": 5.722264766693115, + "epoch": 1.74, + "learning_rate": 3.759811616954474e-05, + "loss": 164.0584, + "step": 2054, + "task_loss": 2.232388734817505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4722879174181478, + "compression/movement_sparsity/importance_threshold": -8.431710403276373e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219039638031875, + "compression/movement_sparsity/model_sparsity": 0.8902337265382265, + "compression_loss": 157.71612548828125, + "distillation_loss": 5.818230628967285, + "epoch": 1.74, + "learning_rate": 3.759207825141891e-05, + "loss": 164.3617, + "step": 2055, + "task_loss": 2.535738945007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4725526807395415, + "compression/movement_sparsity/importance_threshold": -8.351153207879474e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219865386640659, + "compression/movement_sparsity/model_sparsity": 0.8903134646986037, + "compression_loss": 157.7432861328125, + "distillation_loss": 6.892061710357666, + "epoch": 1.74, + "learning_rate": 3.7586040333293084e-05, + "loss": 164.2182, + "step": 2056, + "task_loss": 3.531475067138672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4728157522882985, + "compression/movement_sparsity/importance_threshold": -8.27111075318819e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221089760173509, + "compression/movement_sparsity/model_sparsity": 0.8904316959521421, + "compression_loss": 157.77020263671875, + "distillation_loss": 6.085649490356445, + "epoch": 1.74, + "learning_rate": 3.758000241516725e-05, + "loss": 163.9001, + "step": 2057, + "task_loss": 3.5105717182159424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4730771374867666, + "compression/movement_sparsity/importance_threshold": -8.191581389392546e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.92223900906542, + "compression/movement_sparsity/model_sparsity": 0.8905572619649815, + "compression_loss": 157.7969970703125, + "distillation_loss": 6.869007110595703, + "epoch": 1.74, + "learning_rate": 3.757396449704142e-05, + "loss": 164.4945, + "step": 2058, + "task_loss": 3.155095100402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4733368417572943, + "compression/movement_sparsity/importance_threshold": -8.11256346668265e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223908394919275, + "compression/movement_sparsity/model_sparsity": 0.8907038765492534, + "compression_loss": 157.82359313964844, + "distillation_loss": 6.042092800140381, + "epoch": 1.74, + "learning_rate": 3.756792657891559e-05, + "loss": 163.8759, + "step": 2059, + "task_loss": 3.681732177734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4735948705222297, + "compression/movement_sparsity/importance_threshold": -8.034055335248442e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225007803175301, + "compression/movement_sparsity/model_sparsity": 0.8908100405692791, + "compression_loss": 157.85003662109375, + "distillation_loss": 6.541399955749512, + "epoch": 1.74, + "learning_rate": 3.756188866078976e-05, + "loss": 164.6581, + "step": 2060, + "task_loss": 2.393188953399658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.473851229203921, + "compression/movement_sparsity/importance_threshold": -7.956055345280117e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225234243118706, + "compression/movement_sparsity/model_sparsity": 0.8908319066727529, + "compression_loss": 157.87637329101562, + "distillation_loss": 5.936201572418213, + "epoch": 1.74, + "learning_rate": 3.755585074266393e-05, + "loss": 163.7279, + "step": 2061, + "task_loss": 3.381924867630005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4741059232247165, + "compression/movement_sparsity/importance_threshold": -7.878561846967613e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225495263148255, + "compression/movement_sparsity/model_sparsity": 0.8908571119916072, + "compression_loss": 157.90243530273438, + "distillation_loss": 6.148585319519043, + "epoch": 1.74, + "learning_rate": 3.75498128245381e-05, + "loss": 164.0779, + "step": 2062, + "task_loss": 3.1262643337249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4743589580069643, + "compression/movement_sparsity/importance_threshold": -7.801573190500954e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9226355234118153, + "compression/movement_sparsity/model_sparsity": 0.8909401548237574, + "compression_loss": 157.92831420898438, + "distillation_loss": 6.561042785644531, + "epoch": 1.74, + "learning_rate": 3.7543774906412274e-05, + "loss": 164.9616, + "step": 2063, + "task_loss": 3.166872501373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4746103389730125, + "compression/movement_sparsity/importance_threshold": -7.725087726070251e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227817017828633, + "compression/movement_sparsity/model_sparsity": 0.8910813115180627, + "compression_loss": 157.95407104492188, + "distillation_loss": 8.221183776855469, + "epoch": 1.74, + "learning_rate": 3.753773698828644e-05, + "loss": 164.642, + "step": 2064, + "task_loss": 3.5372185707092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4748600715452094, + "compression/movement_sparsity/importance_threshold": -7.649103803865527e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228398678725911, + "compression/movement_sparsity/model_sparsity": 0.8911374794236685, + "compression_loss": 157.97955322265625, + "distillation_loss": 6.502510070800781, + "epoch": 1.75, + "learning_rate": 3.753169907016061e-05, + "loss": 164.1097, + "step": 2065, + "task_loss": 2.5702710151672363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4751081611459034, + "compression/movement_sparsity/importance_threshold": -7.573619774076893e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230080821054303, + "compression/movement_sparsity/model_sparsity": 0.8912999149801221, + "compression_loss": 158.0048828125, + "distillation_loss": 7.297700881958008, + "epoch": 1.75, + "learning_rate": 3.752566115203478e-05, + "loss": 164.8532, + "step": 2066, + "task_loss": 2.9602088928222656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.475354613197442, + "compression/movement_sparsity/importance_threshold": -7.498633986894286e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232413307485554, + "compression/movement_sparsity/model_sparsity": 0.8915251508147991, + "compression_loss": 158.02993774414062, + "distillation_loss": 7.440400123596191, + "epoch": 1.75, + "learning_rate": 3.751962323390895e-05, + "loss": 164.6024, + "step": 2067, + "task_loss": 4.1229143142700195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4755994331221745, + "compression/movement_sparsity/importance_threshold": -7.424144792507817e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233652466986273, + "compression/movement_sparsity/model_sparsity": 0.891644809870776, + "compression_loss": 158.0548553466797, + "distillation_loss": 7.882330894470215, + "epoch": 1.75, + "learning_rate": 3.751358531578312e-05, + "loss": 165.3719, + "step": 2068, + "task_loss": 3.4940640926361084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4758426263424482, + "compression/movement_sparsity/importance_threshold": -7.350150541107422e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234522335015308, + "compression/movement_sparsity/model_sparsity": 0.8917288084093971, + "compression_loss": 158.0797119140625, + "distillation_loss": 9.107924461364746, + "epoch": 1.75, + "learning_rate": 3.750754739765729e-05, + "loss": 165.4019, + "step": 2069, + "task_loss": 4.153846263885498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4760841982806114, + "compression/movement_sparsity/importance_threshold": -7.276649582883387e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236599167292447, + "compression/movement_sparsity/model_sparsity": 0.8919293570793306, + "compression_loss": 158.10440063476562, + "distillation_loss": 6.349693298339844, + "epoch": 1.75, + "learning_rate": 3.750150947953146e-05, + "loss": 164.0524, + "step": 2070, + "task_loss": 2.878816843032837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4763241543590127, + "compression/movement_sparsity/importance_threshold": -7.203640268025474e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237574444963385, + "compression/movement_sparsity/model_sparsity": 0.8920235344675942, + "compression_loss": 158.1288299560547, + "distillation_loss": 5.780635356903076, + "epoch": 1.75, + "learning_rate": 3.749547156140563e-05, + "loss": 163.8705, + "step": 2071, + "task_loss": 2.8883330821990967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4765625, + "compression/movement_sparsity/importance_threshold": -7.131120946723968e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237529252368044, + "compression/movement_sparsity/model_sparsity": 0.8920191704585281, + "compression_loss": 158.15306091308594, + "distillation_loss": 7.023695468902588, + "epoch": 1.75, + "learning_rate": 3.74894336432798e-05, + "loss": 164.5476, + "step": 2072, + "task_loss": 2.204735517501831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4767992406259216, + "compression/movement_sparsity/importance_threshold": -7.059089969168805e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238106978290002, + "compression/movement_sparsity/model_sparsity": 0.8920749583844526, + "compression_loss": 158.17724609375, + "distillation_loss": 7.226841926574707, + "epoch": 1.75, + "learning_rate": 3.748339572515397e-05, + "loss": 164.8711, + "step": 2073, + "task_loss": 3.5587823390960693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4770343816591256, + "compression/movement_sparsity/importance_threshold": -6.98754568555001e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239283416668956, + "compression/movement_sparsity/model_sparsity": 0.8921885607946015, + "compression_loss": 158.2010955810547, + "distillation_loss": 7.537065505981445, + "epoch": 1.75, + "learning_rate": 3.747735780702814e-05, + "loss": 164.161, + "step": 2074, + "task_loss": 2.887624740600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47726792852196, + "compression/movement_sparsity/importance_threshold": -6.916486446057607e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240291128075863, + "compression/movement_sparsity/model_sparsity": 0.8922858701366012, + "compression_loss": 158.22486877441406, + "distillation_loss": 6.06566047668457, + "epoch": 1.75, + "learning_rate": 3.747131988890231e-05, + "loss": 164.3281, + "step": 2075, + "task_loss": 3.465578317642212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4774998866367737, + "compression/movement_sparsity/importance_threshold": -6.845910600881792e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241099109674868, + "compression/movement_sparsity/model_sparsity": 0.892363892631145, + "compression_loss": 158.24842834472656, + "distillation_loss": 5.879219055175781, + "epoch": 1.75, + "learning_rate": 3.7465281970776475e-05, + "loss": 164.9828, + "step": 2076, + "task_loss": 2.490217924118042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4777302614259142, + "compression/movement_sparsity/importance_threshold": -6.775816500212504e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243039410232576, + "compression/movement_sparsity/model_sparsity": 0.8925512571575938, + "compression_loss": 158.27174377441406, + "distillation_loss": 8.313295364379883, + "epoch": 1.76, + "learning_rate": 3.745924405265065e-05, + "loss": 165.4238, + "step": 2077, + "task_loss": 3.354480504989624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47795905831173, + "compression/movement_sparsity/importance_threshold": -6.706202494239765e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.924402005377895, + "compression/movement_sparsity/model_sparsity": 0.892645952699968, + "compression_loss": 158.2950897216797, + "distillation_loss": 6.969394683837891, + "epoch": 1.76, + "learning_rate": 3.7453206134524816e-05, + "loss": 164.8258, + "step": 2078, + "task_loss": 2.808372735977173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4781862827165693, + "compression/movement_sparsity/importance_threshold": -6.6370669331536e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245530726576737, + "compression/movement_sparsity/model_sparsity": 0.8927918303539492, + "compression_loss": 158.3180694580078, + "distillation_loss": 5.652746200561523, + "epoch": 1.76, + "learning_rate": 3.744716821639899e-05, + "loss": 164.6437, + "step": 2079, + "task_loss": 1.7458292245864868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.47841194006278, + "compression/movement_sparsity/importance_threshold": -6.568408167144206e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246555489543363, + "compression/movement_sparsity/model_sparsity": 0.8928907862745674, + "compression_loss": 158.34088134765625, + "distillation_loss": 6.885915756225586, + "epoch": 1.76, + "learning_rate": 3.744113029827316e-05, + "loss": 165.0661, + "step": 2080, + "task_loss": 2.5404088497161865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4786360357727106, + "compression/movement_sparsity/importance_threshold": -6.500224546401433e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248361166248462, + "compression/movement_sparsity/model_sparsity": 0.8930651508901041, + "compression_loss": 158.3636474609375, + "distillation_loss": 6.806939125061035, + "epoch": 1.76, + "learning_rate": 3.743509238014733e-05, + "loss": 164.6045, + "step": 2081, + "task_loss": 2.705115556716919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4788585752687093, + "compression/movement_sparsity/importance_threshold": -6.43251442111548e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249612011433465, + "compression/movement_sparsity/model_sparsity": 0.8931859383705889, + "compression_loss": 158.38613891601562, + "distillation_loss": 7.433000564575195, + "epoch": 1.76, + "learning_rate": 3.74290544620215e-05, + "loss": 164.6337, + "step": 2082, + "task_loss": 3.7706639766693115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4790795639731242, + "compression/movement_sparsity/importance_threshold": -6.365276141476368e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250869414910666, + "compression/movement_sparsity/model_sparsity": 0.8933073591505423, + "compression_loss": 158.40843200683594, + "distillation_loss": 7.363675117492676, + "epoch": 1.76, + "learning_rate": 3.7423016543895666e-05, + "loss": 165.2568, + "step": 2083, + "task_loss": 3.91349458694458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4792990073083037, + "compression/movement_sparsity/importance_threshold": -6.298508057674037e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251610263445882, + "compression/movement_sparsity/model_sparsity": 0.8933788989614337, + "compression_loss": 158.43051147460938, + "distillation_loss": 7.433717727661133, + "epoch": 1.76, + "learning_rate": 3.741697862576984e-05, + "loss": 165.9625, + "step": 2084, + "task_loss": 3.17760968208313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4795169106965955, + "compression/movement_sparsity/importance_threshold": -6.232208519898683e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9252880783507483, + "compression/movement_sparsity/model_sparsity": 0.8935015863403246, + "compression_loss": 158.45254516601562, + "distillation_loss": 6.165663719177246, + "epoch": 1.76, + "learning_rate": 3.741094070764401e-05, + "loss": 164.7903, + "step": 2085, + "task_loss": 4.272960662841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4797332795603482, + "compression/movement_sparsity/importance_threshold": -6.166375878340329e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254304648364882, + "compression/movement_sparsity/model_sparsity": 0.8936390814122472, + "compression_loss": 158.4743194580078, + "distillation_loss": 6.613044738769531, + "epoch": 1.76, + "learning_rate": 3.7404902789518174e-05, + "loss": 165.0626, + "step": 2086, + "task_loss": 2.774812698364258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4799481193219097, + "compression/movement_sparsity/importance_threshold": -6.101008483188827e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254707685230974, + "compression/movement_sparsity/model_sparsity": 0.8936780005432328, + "compression_loss": 158.49598693847656, + "distillation_loss": 7.976287841796875, + "epoch": 1.76, + "learning_rate": 3.739886487139235e-05, + "loss": 165.0246, + "step": 2087, + "task_loss": 2.2189197540283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4801614354036285, + "compression/movement_sparsity/importance_threshold": -6.036104684634461e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254908965180667, + "compression/movement_sparsity/model_sparsity": 0.893697437079654, + "compression_loss": 158.5174560546875, + "distillation_loss": 6.275365829467773, + "epoch": 1.76, + "learning_rate": 3.7392826953266515e-05, + "loss": 164.8943, + "step": 2088, + "task_loss": 2.8189098834991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.480373233227853, + "compression/movement_sparsity/importance_threshold": -5.971662832867167e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9255871722475587, + "compression/movement_sparsity/model_sparsity": 0.8937904054416591, + "compression_loss": 158.53890991210938, + "distillation_loss": 8.053070068359375, + "epoch": 1.77, + "learning_rate": 3.738678903514068e-05, + "loss": 165.6037, + "step": 2089, + "task_loss": 3.9769766330718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4805835182169307, + "compression/movement_sparsity/importance_threshold": -5.907681278077056e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9257199597783516, + "compression/movement_sparsity/model_sparsity": 0.8939186313122671, + "compression_loss": 158.55996704101562, + "distillation_loss": 7.240005016326904, + "epoch": 1.77, + "learning_rate": 3.7380751117014856e-05, + "loss": 165.9497, + "step": 2090, + "task_loss": 3.2059333324432373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4807922957932103, + "compression/movement_sparsity/importance_threshold": -5.844158370454065e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.925776957299651, + "compression/movement_sparsity/model_sparsity": 0.893973670793365, + "compression_loss": 158.5808563232422, + "distillation_loss": 10.765399932861328, + "epoch": 1.77, + "learning_rate": 3.737471319888903e-05, + "loss": 166.35, + "step": 2091, + "task_loss": 4.414034843444824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4809995713790398, + "compression/movement_sparsity/importance_threshold": -5.781092460188306e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9258721956265588, + "compression/movement_sparsity/model_sparsity": 0.894065637390756, + "compression_loss": 158.60189819335938, + "distillation_loss": 6.704811096191406, + "epoch": 1.77, + "learning_rate": 3.736867528076319e-05, + "loss": 165.655, + "step": 2092, + "task_loss": 3.7092132568359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4812053503967673, + "compression/movement_sparsity/importance_threshold": -5.7184818974698876e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9260008573953497, + "compression/movement_sparsity/model_sparsity": 0.8941898792319791, + "compression_loss": 158.62269592285156, + "distillation_loss": 6.135540008544922, + "epoch": 1.77, + "learning_rate": 3.7362637362637365e-05, + "loss": 164.2481, + "step": 2093, + "task_loss": 2.849139928817749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4814096382687416, + "compression/movement_sparsity/importance_threshold": -5.656325032488661e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9261704667558022, + "compression/movement_sparsity/model_sparsity": 0.8943536619891206, + "compression_loss": 158.64329528808594, + "distillation_loss": 5.725130081176758, + "epoch": 1.77, + "learning_rate": 3.735659944451154e-05, + "loss": 164.6533, + "step": 2094, + "task_loss": 2.984590530395508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.48161244041731, + "compression/movement_sparsity/importance_threshold": -5.594620215434824e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9262539836259238, + "compression/movement_sparsity/model_sparsity": 0.8944343097978256, + "compression_loss": 158.66375732421875, + "distillation_loss": 5.930133819580078, + "epoch": 1.77, + "learning_rate": 3.7350561526385706e-05, + "loss": 164.586, + "step": 2095, + "task_loss": 3.3640546798706055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4818137622648213, + "compression/movement_sparsity/importance_threshold": -5.533365796498487e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.926468857126722, + "compression/movement_sparsity/model_sparsity": 0.8946418017328432, + "compression_loss": 158.68405151367188, + "distillation_loss": 6.960702896118164, + "epoch": 1.77, + "learning_rate": 3.734452360825987e-05, + "loss": 164.3518, + "step": 2096, + "task_loss": 2.9138264656066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4820136092336238, + "compression/movement_sparsity/importance_threshold": -5.472560125869587e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9265660748654573, + "compression/movement_sparsity/model_sparsity": 0.8947356797431761, + "compression_loss": 158.7042236328125, + "distillation_loss": 6.667351722717285, + "epoch": 1.77, + "learning_rate": 3.733848569013405e-05, + "loss": 164.567, + "step": 2097, + "task_loss": 3.5075442790985107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4822119867460652, + "compression/movement_sparsity/importance_threshold": -5.4122015537381475e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9266823354999069, + "compression/movement_sparsity/model_sparsity": 0.8948479464671729, + "compression_loss": 158.7241668701172, + "distillation_loss": 4.760207176208496, + "epoch": 1.77, + "learning_rate": 3.7332447772008214e-05, + "loss": 164.973, + "step": 2098, + "task_loss": 1.6393064260482788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4824089002244938, + "compression/movement_sparsity/importance_threshold": -5.352288430294366e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9267844063748699, + "compression/movement_sparsity/model_sparsity": 0.8949465108935741, + "compression_loss": 158.74392700195312, + "distillation_loss": 6.518259048461914, + "epoch": 1.77, + "learning_rate": 3.732640985388238e-05, + "loss": 164.707, + "step": 2099, + "task_loss": 3.243020534515381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4826043550912582, + "compression/movement_sparsity/importance_threshold": -5.292819105728007e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9269328384136011, + "compression/movement_sparsity/model_sparsity": 0.8950898438351446, + "compression_loss": 158.7636260986328, + "distillation_loss": 5.039811134338379, + "epoch": 1.77, + "learning_rate": 3.7320371935756555e-05, + "loss": 164.335, + "step": 2100, + "task_loss": 1.7412668466567993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4827983567687062, + "compression/movement_sparsity/importance_threshold": -5.23379193022944e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.927053904487608, + "compression/movement_sparsity/model_sparsity": 0.8952067509170666, + "compression_loss": 158.78305053710938, + "distillation_loss": 8.121597290039062, + "epoch": 1.78, + "learning_rate": 3.731433401763072e-05, + "loss": 165.423, + "step": 2101, + "task_loss": 4.506771087646484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.482990910679186, + "compression/movement_sparsity/importance_threshold": -5.1752052539885164e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9271582290302542, + "compression/movement_sparsity/model_sparsity": 0.895307491590733, + "compression_loss": 158.80238342285156, + "distillation_loss": 7.60074520111084, + "epoch": 1.78, + "learning_rate": 3.730829609950489e-05, + "loss": 164.6981, + "step": 2102, + "task_loss": 4.195941925048828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.483182022245046, + "compression/movement_sparsity/importance_threshold": -5.1170574271952596e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.927309391703374, + "compression/movement_sparsity/model_sparsity": 0.8954534613610003, + "compression_loss": 158.82159423828125, + "distillation_loss": 7.900430679321289, + "epoch": 1.78, + "learning_rate": 3.7302258181379063e-05, + "loss": 165.3778, + "step": 2103, + "task_loss": 4.078195095062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4833716968886344, + "compression/movement_sparsity/importance_threshold": -5.059346800039867e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9273589127715658, + "compression/movement_sparsity/model_sparsity": 0.8955012812281551, + "compression_loss": 158.84059143066406, + "distillation_loss": 6.955070495605469, + "epoch": 1.78, + "learning_rate": 3.729622026325324e-05, + "loss": 165.3256, + "step": 2104, + "task_loss": 2.5524044036865234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4835599400322992, + "compression/movement_sparsity/importance_threshold": -5.002071722712276e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9274604351348175, + "compression/movement_sparsity/model_sparsity": 0.8955993159859098, + "compression_loss": 158.8595428466797, + "distillation_loss": 8.337209701538086, + "epoch": 1.78, + "learning_rate": 3.72901823451274e-05, + "loss": 166.1333, + "step": 2105, + "task_loss": 4.282992839813232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4837467570983887, + "compression/movement_sparsity/importance_threshold": -4.9452305454025965e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9275472430752065, + "compression/movement_sparsity/model_sparsity": 0.8956831418064941, + "compression_loss": 158.87820434570312, + "distillation_loss": 5.976417541503906, + "epoch": 1.78, + "learning_rate": 3.728414442700157e-05, + "loss": 165.164, + "step": 2106, + "task_loss": 3.3806724548339844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4839321535092513, + "compression/movement_sparsity/importance_threshold": -4.888821618300766e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9276258114157593, + "compression/movement_sparsity/model_sparsity": 0.8957590110828443, + "compression_loss": 158.8968048095703, + "distillation_loss": 5.764797687530518, + "epoch": 1.78, + "learning_rate": 3.7278106508875746e-05, + "loss": 165.0053, + "step": 2107, + "task_loss": 3.1879451274871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4841161346872347, + "compression/movement_sparsity/importance_threshold": -4.832843291596896e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.927695639341435, + "compression/movement_sparsity/model_sparsity": 0.8958264402044571, + "compression_loss": 158.9152069091797, + "distillation_loss": 8.304316520690918, + "epoch": 1.78, + "learning_rate": 3.7272068590749906e-05, + "loss": 165.3387, + "step": 2108, + "task_loss": 4.559038162231445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4842987060546875, + "compression/movement_sparsity/importance_threshold": -4.7772939154810956e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9278333277051262, + "compression/movement_sparsity/model_sparsity": 0.8959593985492768, + "compression_loss": 158.9334716796875, + "distillation_loss": 7.059016704559326, + "epoch": 1.78, + "learning_rate": 3.726603067262408e-05, + "loss": 165.2538, + "step": 2109, + "task_loss": 4.099099159240723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4844798730339577, + "compression/movement_sparsity/importance_threshold": -4.722171840143303e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9279292934062596, + "compression/movement_sparsity/model_sparsity": 0.8960520675333512, + "compression_loss": 158.95162963867188, + "distillation_loss": 6.886305809020996, + "epoch": 1.78, + "learning_rate": 3.7259992754498254e-05, + "loss": 165.6854, + "step": 2110, + "task_loss": 3.0369884967803955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4846596410473936, + "compression/movement_sparsity/importance_threshold": -4.667475415773715e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.928077785065829, + "compression/movement_sparsity/model_sparsity": 0.8961954580476006, + "compression_loss": 158.96951293945312, + "distillation_loss": 5.922722816467285, + "epoch": 1.78, + "learning_rate": 3.725395483637242e-05, + "loss": 165.4843, + "step": 2111, + "task_loss": 2.9188270568847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4848380155173435, + "compression/movement_sparsity/importance_threshold": -4.613202992562095e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9281727729852163, + "compression/movement_sparsity/model_sparsity": 0.89628718283974, + "compression_loss": 158.98728942871094, + "distillation_loss": 7.716068744659424, + "epoch": 1.78, + "learning_rate": 3.724791691824659e-05, + "loss": 165.4163, + "step": 2112, + "task_loss": 3.7409980297088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4850150018661554, + "compression/movement_sparsity/importance_threshold": -4.559352920698815e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9282400014423473, + "compression/movement_sparsity/model_sparsity": 0.8963521017925495, + "compression_loss": 159.0049285888672, + "distillation_loss": 7.8005828857421875, + "epoch": 1.79, + "learning_rate": 3.724187900012076e-05, + "loss": 165.2228, + "step": 2113, + "task_loss": 2.960425615310669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4851906055161774, + "compression/movement_sparsity/importance_threshold": -4.505923550373811e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9283603997629666, + "compression/movement_sparsity/model_sparsity": 0.896468364060467, + "compression_loss": 159.0223846435547, + "distillation_loss": 8.260976791381836, + "epoch": 1.79, + "learning_rate": 3.723584108199493e-05, + "loss": 165.511, + "step": 2114, + "task_loss": 3.4331555366516113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.485364831889758, + "compression/movement_sparsity/importance_threshold": -4.4529132317770205e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9285026670070301, + "compression/movement_sparsity/model_sparsity": 0.8966057439870316, + "compression_loss": 159.0397186279297, + "distillation_loss": 8.048629760742188, + "epoch": 1.79, + "learning_rate": 3.72298031638691e-05, + "loss": 165.7658, + "step": 2115, + "task_loss": 3.9720163345336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4855376864092453, + "compression/movement_sparsity/importance_threshold": -4.400320315098554e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9285825947026933, + "compression/movement_sparsity/model_sparsity": 0.8966829259204625, + "compression_loss": 159.05697631835938, + "distillation_loss": 7.3435893058776855, + "epoch": 1.79, + "learning_rate": 3.722376524574327e-05, + "loss": 166.1063, + "step": 2116, + "task_loss": 3.122779607772827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4857091744969875, + "compression/movement_sparsity/importance_threshold": -4.348143150528436e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9286929409499955, + "compression/movement_sparsity/model_sparsity": 0.8967894814347052, + "compression_loss": 159.07412719726562, + "distillation_loss": 9.602479934692383, + "epoch": 1.79, + "learning_rate": 3.721772732761744e-05, + "loss": 166.8232, + "step": 2117, + "task_loss": 3.6714892387390137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4858793015753324, + "compression/movement_sparsity/importance_threshold": -4.2963800882568626e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9287387536020526, + "compression/movement_sparsity/model_sparsity": 0.8968337202812278, + "compression_loss": 159.0909881591797, + "distillation_loss": 6.056572914123535, + "epoch": 1.79, + "learning_rate": 3.7211689409491605e-05, + "loss": 165.3918, + "step": 2118, + "task_loss": 2.5096402168273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4860480730666286, + "compression/movement_sparsity/importance_threshold": -4.245029478473685e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9288878295458362, + "compression/movement_sparsity/model_sparsity": 0.8969776750077312, + "compression_loss": 159.10777282714844, + "distillation_loss": 6.089041709899902, + "epoch": 1.79, + "learning_rate": 3.720565149136578e-05, + "loss": 165.6644, + "step": 2119, + "task_loss": 3.466416597366333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4862154943932244, + "compression/movement_sparsity/importance_threshold": -4.194089671369014e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9289536747995214, + "compression/movement_sparsity/model_sparsity": 0.8970412582743886, + "compression_loss": 159.12440490722656, + "distillation_loss": 6.5863823890686035, + "epoch": 1.79, + "learning_rate": 3.719961357323995e-05, + "loss": 165.5238, + "step": 2120, + "task_loss": 2.5310258865356445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4863815709774677, + "compression/movement_sparsity/importance_threshold": -4.14355901713296e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9290470648804454, + "compression/movement_sparsity/model_sparsity": 0.8971314401187315, + "compression_loss": 159.14089965820312, + "distillation_loss": 6.355241298675537, + "epoch": 1.79, + "learning_rate": 3.7193575655114113e-05, + "loss": 166.3652, + "step": 2121, + "task_loss": 3.017137050628662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.486546308241707, + "compression/movement_sparsity/importance_threshold": -4.0934358659554596e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.929107758893712, + "compression/movement_sparsity/model_sparsity": 0.8971900491059257, + "compression_loss": 159.1572723388672, + "distillation_loss": 4.4748311042785645, + "epoch": 1.79, + "learning_rate": 3.718753773698829e-05, + "loss": 165.1463, + "step": 2122, + "task_loss": 2.4547290802001953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4867097116082901, + "compression/movement_sparsity/importance_threshold": -4.043718568026624e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9291622881123108, + "compression/movement_sparsity/model_sparsity": 0.8972427050781141, + "compression_loss": 159.17340087890625, + "distillation_loss": 6.962255954742432, + "epoch": 1.79, + "learning_rate": 3.718149981886246e-05, + "loss": 165.5693, + "step": 2123, + "task_loss": 3.743806838989258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4868717864995658, + "compression/movement_sparsity/importance_threshold": -3.994405473536477e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9292816490303457, + "compression/movement_sparsity/model_sparsity": 0.8973579655814174, + "compression_loss": 159.18948364257812, + "distillation_loss": 5.165178298950195, + "epoch": 1.79, + "learning_rate": 3.717546190073663e-05, + "loss": 165.6816, + "step": 2124, + "task_loss": 3.4439103603363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4870325383378815, + "compression/movement_sparsity/importance_threshold": -3.945494932675129e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9293313370368844, + "compression/movement_sparsity/model_sparsity": 0.8974059466520733, + "compression_loss": 159.20530700683594, + "distillation_loss": 5.668382167816162, + "epoch": 1.8, + "learning_rate": 3.7169423982610796e-05, + "loss": 165.1665, + "step": 2125, + "task_loss": 2.804696798324585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.487191972545586, + "compression/movement_sparsity/importance_threshold": -3.896985295632604e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9294567077354079, + "compression/movement_sparsity/model_sparsity": 0.8975270104814171, + "compression_loss": 159.2210235595703, + "distillation_loss": 6.560277938842773, + "epoch": 1.8, + "learning_rate": 3.716338606448497e-05, + "loss": 164.9624, + "step": 2126, + "task_loss": 3.6128621101379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.487350094545027, + "compression/movement_sparsity/importance_threshold": -3.848874912598839e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9295994996428474, + "compression/movement_sparsity/model_sparsity": 0.8976648970475567, + "compression_loss": 159.2365264892578, + "distillation_loss": 5.549208641052246, + "epoch": 1.8, + "learning_rate": 3.715734814635914e-05, + "loss": 164.923, + "step": 2127, + "task_loss": 3.7202606201171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4875069097585532, + "compression/movement_sparsity/importance_threshold": -3.8011621337640314e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9296634131813756, + "compression/movement_sparsity/model_sparsity": 0.8977266149594154, + "compression_loss": 159.251953125, + "distillation_loss": 6.0247697830200195, + "epoch": 1.8, + "learning_rate": 3.7151310228233304e-05, + "loss": 165.4937, + "step": 2128, + "task_loss": 3.0278818607330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4876624236085125, + "compression/movement_sparsity/importance_threshold": -3.7538453093181184e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9297507219368053, + "compression/movement_sparsity/model_sparsity": 0.8978109243905031, + "compression_loss": 159.26710510253906, + "distillation_loss": 6.926674842834473, + "epoch": 1.8, + "learning_rate": 3.714527231010748e-05, + "loss": 165.619, + "step": 2129, + "task_loss": 2.1637282371520996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4878166415172533, + "compression/movement_sparsity/importance_threshold": -3.706922789451124e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9298541640910465, + "compression/movement_sparsity/model_sparsity": 0.8979108129885207, + "compression_loss": 159.28213500976562, + "distillation_loss": 6.85511589050293, + "epoch": 1.8, + "learning_rate": 3.7139234391981645e-05, + "loss": 165.4317, + "step": 2130, + "task_loss": 3.002366304397583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4879695689071237, + "compression/movement_sparsity/importance_threshold": -3.660392924353245e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9299306933989334, + "compression/movement_sparsity/model_sparsity": 0.8979847132792501, + "compression_loss": 159.29702758789062, + "distillation_loss": 4.378478527069092, + "epoch": 1.8, + "learning_rate": 3.713319647385581e-05, + "loss": 165.1144, + "step": 2131, + "task_loss": 2.2784366607666016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4881212112004718, + "compression/movement_sparsity/importance_threshold": -3.614254064214419e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9300515925345935, + "compression/movement_sparsity/model_sparsity": 0.8981014591576709, + "compression_loss": 159.31187438964844, + "distillation_loss": 6.954366683959961, + "epoch": 1.8, + "learning_rate": 3.7127158555729986e-05, + "loss": 165.4568, + "step": 2132, + "task_loss": 3.723555564880371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.488271573819646, + "compression/movement_sparsity/importance_threshold": -3.56850455922467e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9302106490066883, + "compression/movement_sparsity/model_sparsity": 0.8982550515506343, + "compression_loss": 159.32643127441406, + "distillation_loss": 7.012324333190918, + "epoch": 1.8, + "learning_rate": 3.7121120637604153e-05, + "loss": 165.3813, + "step": 2133, + "task_loss": 3.294421434402466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4884206621869942, + "compression/movement_sparsity/importance_threshold": -3.5231427595741084e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9303314765973424, + "compression/movement_sparsity/model_sparsity": 0.8983717283418404, + "compression_loss": 159.34104919433594, + "distillation_loss": 7.716732025146484, + "epoch": 1.8, + "learning_rate": 3.711508271947833e-05, + "loss": 165.2608, + "step": 2134, + "task_loss": 3.2470779418945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4885684817248646, + "compression/movement_sparsity/importance_threshold": -3.4781670154528446e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9304352884007803, + "compression/movement_sparsity/model_sparsity": 0.8984719738904676, + "compression_loss": 159.3553924560547, + "distillation_loss": 6.853826999664307, + "epoch": 1.8, + "learning_rate": 3.7109044801352495e-05, + "loss": 165.7967, + "step": 2135, + "task_loss": 3.518109083175659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4887150378556058, + "compression/movement_sparsity/importance_threshold": -3.433575677050729e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.930519914218492, + "compression/movement_sparsity/model_sparsity": 0.8985536925510015, + "compression_loss": 159.36978149414062, + "distillation_loss": 6.5251007080078125, + "epoch": 1.81, + "learning_rate": 3.710300688322667e-05, + "loss": 166.5522, + "step": 2136, + "task_loss": 3.2383108139038086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4888603360015658, + "compression/movement_sparsity/importance_threshold": -3.3893670945579595e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9305726905844482, + "compression/movement_sparsity/model_sparsity": 0.8986046558864281, + "compression_loss": 159.38400268554688, + "distillation_loss": 5.866753578186035, + "epoch": 1.81, + "learning_rate": 3.7096968965100836e-05, + "loss": 165.5232, + "step": 2137, + "task_loss": 3.39377498626709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4890043815850924, + "compression/movement_sparsity/importance_threshold": -3.345539618164559e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.930643233960182, + "compression/movement_sparsity/model_sparsity": 0.8986727758801887, + "compression_loss": 159.3979034423828, + "distillation_loss": 4.7557830810546875, + "epoch": 1.81, + "learning_rate": 3.7090931046975e-05, + "loss": 165.4998, + "step": 2138, + "task_loss": 3.002495288848877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4891471800285343, + "compression/movement_sparsity/importance_threshold": -3.302091598060466e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9306605001549187, + "compression/movement_sparsity/model_sparsity": 0.8986894489280192, + "compression_loss": 159.411865234375, + "distillation_loss": 7.329342842102051, + "epoch": 1.81, + "learning_rate": 3.708489312884918e-05, + "loss": 166.3008, + "step": 2139, + "task_loss": 3.2656755447387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4892887367542396, + "compression/movement_sparsity/importance_threshold": -3.259021384435876e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9307779770544673, + "compression/movement_sparsity/model_sparsity": 0.898802890134667, + "compression_loss": 159.42555236816406, + "distillation_loss": 5.809759140014648, + "epoch": 1.81, + "learning_rate": 3.7078855210723344e-05, + "loss": 165.9357, + "step": 2140, + "task_loss": 2.366703987121582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4894290571845565, + "compression/movement_sparsity/importance_threshold": -3.216327327480728e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9309118496845151, + "compression/movement_sparsity/model_sparsity": 0.8989321638280323, + "compression_loss": 159.4390869140625, + "distillation_loss": 8.127060890197754, + "epoch": 1.81, + "learning_rate": 3.707281729259751e-05, + "loss": 166.01, + "step": 2141, + "task_loss": 3.5388832092285156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.489568146741833, + "compression/movement_sparsity/importance_threshold": -3.174007777385218e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9310097828733085, + "compression/movement_sparsity/model_sparsity": 0.8990267327105128, + "compression_loss": 159.45266723632812, + "distillation_loss": 7.221467971801758, + "epoch": 1.81, + "learning_rate": 3.7066779374471685e-05, + "loss": 166.0727, + "step": 2142, + "task_loss": 3.764788866043091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4897060108484173, + "compression/movement_sparsity/importance_threshold": -3.132061084339197e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.931155257718466, + "compression/movement_sparsity/model_sparsity": 0.8991672100472062, + "compression_loss": 159.46607971191406, + "distillation_loss": 6.659930229187012, + "epoch": 1.81, + "learning_rate": 3.706074145634585e-05, + "loss": 166.7952, + "step": 2143, + "task_loss": 2.9592795372009277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.489842654926658, + "compression/movement_sparsity/importance_threshold": -3.0904855985328626e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9312677622401104, + "compression/movement_sparsity/model_sparsity": 0.8992758496924278, + "compression_loss": 159.4792938232422, + "distillation_loss": 6.183413505554199, + "epoch": 1.81, + "learning_rate": 3.7054703538220026e-05, + "loss": 165.463, + "step": 2144, + "task_loss": 3.170867919921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4899780843989028, + "compression/movement_sparsity/importance_threshold": -3.0492796701561513e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9314213693675956, + "compression/movement_sparsity/model_sparsity": 0.899424179942533, + "compression_loss": 159.49249267578125, + "distillation_loss": 6.067525386810303, + "epoch": 1.81, + "learning_rate": 3.7048665620094194e-05, + "loss": 166.0803, + "step": 2145, + "task_loss": 2.969949245452881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4901123046875, + "compression/movement_sparsity/importance_threshold": -3.008441649399174e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9314494627065457, + "compression/movement_sparsity/model_sparsity": 0.899451308188865, + "compression_loss": 159.5055694580078, + "distillation_loss": 5.507415771484375, + "epoch": 1.81, + "learning_rate": 3.704262770196836e-05, + "loss": 165.7405, + "step": 2146, + "task_loss": 2.9966232776641846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490245321214798, + "compression/movement_sparsity/importance_threshold": -2.9679698864519544e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9314968255003954, + "compression/movement_sparsity/model_sparsity": 0.8994970439250409, + "compression_loss": 159.5183868408203, + "distillation_loss": 5.309004306793213, + "epoch": 1.81, + "learning_rate": 3.7036589783842535e-05, + "loss": 165.4328, + "step": 2147, + "task_loss": 2.4411094188690186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4903771394031449, + "compression/movement_sparsity/importance_threshold": -2.9278627315046032e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.931572448571542, + "compression/movement_sparsity/model_sparsity": 0.8995700691110499, + "compression_loss": 159.5310821533203, + "distillation_loss": 6.944520950317383, + "epoch": 1.82, + "learning_rate": 3.70305518657167e-05, + "loss": 165.8476, + "step": 2148, + "task_loss": 3.641326665878296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490507764674889, + "compression/movement_sparsity/importance_threshold": -2.888118534746971e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9316739709347936, + "compression/movement_sparsity/model_sparsity": 0.8996681038688046, + "compression_loss": 159.543701171875, + "distillation_loss": 6.9009246826171875, + "epoch": 1.82, + "learning_rate": 3.702451394759087e-05, + "loss": 165.7909, + "step": 2149, + "task_loss": 3.2321109771728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4906372024523782, + "compression/movement_sparsity/importance_threshold": -2.8487356463693415e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9316792056443859, + "compression/movement_sparsity/model_sparsity": 0.8996731587500184, + "compression_loss": 159.55606079101562, + "distillation_loss": 5.472177505493164, + "epoch": 1.82, + "learning_rate": 3.701847602946504e-05, + "loss": 165.8256, + "step": 2150, + "task_loss": 2.0703155994415283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.490765458157961, + "compression/movement_sparsity/importance_threshold": -2.809712416561652e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317664309306422, + "compression/movement_sparsity/model_sparsity": 0.8997573875793554, + "compression_loss": 159.56846618652344, + "distillation_loss": 6.139923572540283, + "epoch": 1.82, + "learning_rate": 3.701243811133921e-05, + "loss": 165.6216, + "step": 2151, + "task_loss": 3.5726733207702637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4908925372139854, + "compression/movement_sparsity/importance_threshold": -2.7710471955139267e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9318045763429093, + "compression/movement_sparsity/model_sparsity": 0.8997942225793621, + "compression_loss": 159.5805206298828, + "distillation_loss": 5.594900608062744, + "epoch": 1.82, + "learning_rate": 3.7006400193213384e-05, + "loss": 165.0827, + "step": 2152, + "task_loss": 2.7253451347351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4910184450427997, + "compression/movement_sparsity/importance_threshold": -2.7327383334162757e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317699127875918, + "compression/movement_sparsity/model_sparsity": 0.8997607498238075, + "compression_loss": 159.59254455566406, + "distillation_loss": 8.02879524230957, + "epoch": 1.82, + "learning_rate": 3.700036227508755e-05, + "loss": 166.3777, + "step": 2153, + "task_loss": 3.7876791954040527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.491143187066752, + "compression/movement_sparsity/importance_threshold": -2.694784180458723e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9317664190064745, + "compression/movement_sparsity/model_sparsity": 0.8997573760648196, + "compression_loss": 159.60446166992188, + "distillation_loss": 5.754787921905518, + "epoch": 1.82, + "learning_rate": 3.6994324356961725e-05, + "loss": 165.7267, + "step": 2154, + "task_loss": 2.83111310005188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4912667687081906, + "compression/movement_sparsity/importance_threshold": -2.657183086831206e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9319087339472085, + "compression/movement_sparsity/model_sparsity": 0.8998948020495275, + "compression_loss": 159.6162567138672, + "distillation_loss": 6.603320121765137, + "epoch": 1.82, + "learning_rate": 3.698828643883589e-05, + "loss": 166.9662, + "step": 2155, + "task_loss": 2.6097564697265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4913891953894636, + "compression/movement_sparsity/importance_threshold": -2.6199334027239216e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9319926323906945, + "compression/movement_sparsity/model_sparsity": 0.8999758183233779, + "compression_loss": 159.62791442871094, + "distillation_loss": 5.65250301361084, + "epoch": 1.82, + "learning_rate": 3.698224852071006e-05, + "loss": 165.6558, + "step": 2156, + "task_loss": 3.757375478744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4915104725329194, + "compression/movement_sparsity/importance_threshold": -2.5830334783269807e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9320808473828646, + "compression/movement_sparsity/model_sparsity": 0.9000610028591859, + "compression_loss": 159.6395721435547, + "distillation_loss": 5.8403778076171875, + "epoch": 1.82, + "learning_rate": 3.6976210602584234e-05, + "loss": 165.4961, + "step": 2157, + "task_loss": 2.883249282836914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4916306055609059, + "compression/movement_sparsity/importance_threshold": -2.546481663830147e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9322117628193387, + "compression/movement_sparsity/model_sparsity": 0.9001874209476741, + "compression_loss": 159.6510467529297, + "distillation_loss": 6.101372241973877, + "epoch": 1.82, + "learning_rate": 3.69701726844584e-05, + "loss": 166.0448, + "step": 2158, + "task_loss": 3.9884471893310547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4917495998957713, + "compression/movement_sparsity/importance_threshold": -2.5102763094237042e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9322761056279019, + "compression/movement_sparsity/model_sparsity": 0.9002495533828215, + "compression_loss": 159.6622772216797, + "distillation_loss": 5.891214370727539, + "epoch": 1.82, + "learning_rate": 3.696413476633257e-05, + "loss": 166.0175, + "step": 2159, + "task_loss": 2.809208393096924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4918674609598641, + "compression/movement_sparsity/importance_threshold": -2.47441576529759e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9324273398460275, + "compression/movement_sparsity/model_sparsity": 0.9003955922403036, + "compression_loss": 159.67349243164062, + "distillation_loss": 7.285126209259033, + "epoch": 1.83, + "learning_rate": 3.695809684820674e-05, + "loss": 165.8108, + "step": 2160, + "task_loss": 3.6934750080108643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4919841941755325, + "compression/movement_sparsity/importance_threshold": -2.4388983816419144e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9324955937815752, + "compression/movement_sparsity/model_sparsity": 0.9004615014431916, + "compression_loss": 159.68466186523438, + "distillation_loss": 5.652138710021973, + "epoch": 1.83, + "learning_rate": 3.695205893008091e-05, + "loss": 165.9007, + "step": 2161, + "task_loss": 2.384229898452759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4920998049651244, + "compression/movement_sparsity/importance_threshold": -2.4037225086467016e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9325608547510462, + "compression/movement_sparsity/model_sparsity": 0.9005245204975951, + "compression_loss": 159.695556640625, + "distillation_loss": 7.637294769287109, + "epoch": 1.83, + "learning_rate": 3.6946021011955076e-05, + "loss": 166.3575, + "step": 2162, + "task_loss": 3.6324594020843506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4922142987509879, + "compression/movement_sparsity/importance_threshold": -2.3688864965019756e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9326347488178858, + "compression/movement_sparsity/model_sparsity": 0.900595876075914, + "compression_loss": 159.70648193359375, + "distillation_loss": 5.64243221282959, + "epoch": 1.83, + "learning_rate": 3.693998309382925e-05, + "loss": 165.5204, + "step": 2163, + "task_loss": 2.139094114303589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4923276809554715, + "compression/movement_sparsity/importance_threshold": -2.3343886953978467e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9326593722240538, + "compression/movement_sparsity/model_sparsity": 0.9006196535923296, + "compression_loss": 159.71719360351562, + "distillation_loss": 6.932564735412598, + "epoch": 1.83, + "learning_rate": 3.6933945175703424e-05, + "loss": 166.0483, + "step": 2164, + "task_loss": 2.399298906326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4924399570009235, + "compression/movement_sparsity/importance_threshold": -2.3002274555241654e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9326743370544368, + "compression/movement_sparsity/model_sparsity": 0.9006341043347518, + "compression_loss": 159.72779846191406, + "distillation_loss": 8.339947700500488, + "epoch": 1.83, + "learning_rate": 3.6927907257577585e-05, + "loss": 166.4438, + "step": 2165, + "task_loss": 3.3859751224517822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4925511323096916, + "compression/movement_sparsity/importance_threshold": -2.2664011270713026e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.932698626583911, + "compression/movement_sparsity/model_sparsity": 0.9006575594441653, + "compression_loss": 159.7381591796875, + "distillation_loss": 4.974183082580566, + "epoch": 1.83, + "learning_rate": 3.692186933945176e-05, + "loss": 166.1561, + "step": 2166, + "task_loss": 3.0369067192077637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4926612123041245, + "compression/movement_sparsity/importance_threshold": -2.232908060229022e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9327527026841397, + "compression/movement_sparsity/model_sparsity": 0.9007097778639935, + "compression_loss": 159.74868774414062, + "distillation_loss": 5.794914722442627, + "epoch": 1.83, + "learning_rate": 3.691583142132593e-05, + "loss": 165.5491, + "step": 2167, + "task_loss": 2.7701401710510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4927702024065703, + "compression/movement_sparsity/importance_threshold": -2.199746605187434e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9328387117052972, + "compression/movement_sparsity/model_sparsity": 0.9007928322106795, + "compression_loss": 159.75888061523438, + "distillation_loss": 8.15744400024414, + "epoch": 1.83, + "learning_rate": 3.690979350320009e-05, + "loss": 166.7437, + "step": 2168, + "task_loss": 4.08432674407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4928781080393767, + "compression/movement_sparsity/importance_threshold": -2.1669151121367358e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9328935986489251, + "compression/movement_sparsity/model_sparsity": 0.9008458336189418, + "compression_loss": 159.76902770996094, + "distillation_loss": 7.673264980316162, + "epoch": 1.83, + "learning_rate": 3.690375558507427e-05, + "loss": 166.0066, + "step": 2169, + "task_loss": 4.269114971160889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4929849346248927, + "compression/movement_sparsity/importance_threshold": -2.1344119312667782e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9330171687981355, + "compression/movement_sparsity/model_sparsity": 0.9009651587533807, + "compression_loss": 159.7790985107422, + "distillation_loss": 5.9905266761779785, + "epoch": 1.83, + "learning_rate": 3.689771766694844e-05, + "loss": 166.3865, + "step": 2170, + "task_loss": 3.4131176471710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.493090687585466, + "compression/movement_sparsity/importance_threshold": -2.1022354127677584e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9331158412853222, + "compression/movement_sparsity/model_sparsity": 0.9010604415370804, + "compression_loss": 159.7891387939453, + "distillation_loss": 4.596959114074707, + "epoch": 1.83, + "learning_rate": 3.689167974882261e-05, + "loss": 165.1221, + "step": 2171, + "task_loss": 2.5268735885620117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4931953723434446, + "compression/movement_sparsity/importance_threshold": -2.0703839068296134e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9331393199713972, + "compression/movement_sparsity/model_sparsity": 0.9010831136580599, + "compression_loss": 159.79879760742188, + "distillation_loss": 5.933595657348633, + "epoch": 1.84, + "learning_rate": 3.6885641830696775e-05, + "loss": 165.1783, + "step": 2172, + "task_loss": 2.5600225925445557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4932989943211772, + "compression/movement_sparsity/importance_threshold": -2.038855763642454e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9332004432546986, + "compression/movement_sparsity/model_sparsity": 0.9011421371685427, + "compression_loss": 159.80856323242188, + "distillation_loss": 5.872112274169922, + "epoch": 1.84, + "learning_rate": 3.687960391257095e-05, + "loss": 165.9194, + "step": 2173, + "task_loss": 2.7253353595733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4934015589410117, + "compression/movement_sparsity/importance_threshold": -2.007649333396304e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.933262126973879, + "compression/movement_sparsity/model_sparsity": 0.9012017018622078, + "compression_loss": 159.8181610107422, + "distillation_loss": 5.918325424194336, + "epoch": 1.84, + "learning_rate": 3.6873565994445116e-05, + "loss": 165.9345, + "step": 2174, + "task_loss": 3.452815055847168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4935030716252964, + "compression/movement_sparsity/importance_threshold": -1.976762966281187e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9333678704924738, + "compression/movement_sparsity/model_sparsity": 0.9013038127656339, + "compression_loss": 159.82754516601562, + "distillation_loss": 7.002851486206055, + "epoch": 1.84, + "learning_rate": 3.6867528076319284e-05, + "loss": 166.2592, + "step": 2175, + "task_loss": 3.4002463817596436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4936035377963792, + "compression/movement_sparsity/importance_threshold": -1.946195012487214e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9334123476377556, + "compression/movement_sparsity/model_sparsity": 0.9013467619841475, + "compression_loss": 159.83685302734375, + "distillation_loss": 5.348321437835693, + "epoch": 1.84, + "learning_rate": 3.686149015819346e-05, + "loss": 165.6941, + "step": 2176, + "task_loss": 3.303271770477295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4937029628766088, + "compression/movement_sparsity/importance_threshold": -1.9159438222043218e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9335365378436832, + "compression/movement_sparsity/model_sparsity": 0.9014666858744477, + "compression_loss": 159.84608459472656, + "distillation_loss": 7.034958839416504, + "epoch": 1.84, + "learning_rate": 3.685545224006763e-05, + "loss": 166.2113, + "step": 2177, + "task_loss": 3.7367100715637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.493801352288333, + "compression/movement_sparsity/importance_threshold": -1.8860077456226212e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9335566300661495, + "compression/movement_sparsity/model_sparsity": 0.9014860878672615, + "compression_loss": 159.85511779785156, + "distillation_loss": 6.940305709838867, + "epoch": 1.84, + "learning_rate": 3.684941432194179e-05, + "loss": 167.359, + "step": 2178, + "task_loss": 3.855077028274536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4938987114539004, + "compression/movement_sparsity/importance_threshold": -1.856385132932223e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9336910035112381, + "compression/movement_sparsity/model_sparsity": 0.9016158451711301, + "compression_loss": 159.86415100097656, + "distillation_loss": 7.866343021392822, + "epoch": 1.84, + "learning_rate": 3.6843376403815966e-05, + "loss": 166.2573, + "step": 2179, + "task_loss": 3.4499807357788086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4939950457956588, + "compression/movement_sparsity/importance_threshold": -1.8270743343230637e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9336758001975024, + "compression/movement_sparsity/model_sparsity": 0.9016011641379921, + "compression_loss": 159.8729248046875, + "distillation_loss": 5.8079681396484375, + "epoch": 1.84, + "learning_rate": 3.683733848569014e-05, + "loss": 165.4043, + "step": 2180, + "task_loss": 3.4423396587371826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4940903607359564, + "compression/movement_sparsity/importance_threshold": -1.7980736999852545e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9337295185727019, + "compression/movement_sparsity/model_sparsity": 0.9016530371217465, + "compression_loss": 159.88168334960938, + "distillation_loss": 6.328896522521973, + "epoch": 1.84, + "learning_rate": 3.68313005675643e-05, + "loss": 166.4665, + "step": 2181, + "task_loss": 4.046377182006836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4941846616971417, + "compression/movement_sparsity/importance_threshold": -1.7693815801087323e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9337566699024088, + "compression/movement_sparsity/model_sparsity": 0.9016792557197507, + "compression_loss": 159.89035034179688, + "distillation_loss": 8.63490104675293, + "epoch": 1.84, + "learning_rate": 3.6825262649438474e-05, + "loss": 166.7287, + "step": 2182, + "task_loss": 3.969119071960449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4942779541015625, + "compression/movement_sparsity/importance_threshold": -1.7409963248837812e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9337952803572137, + "compression/movement_sparsity/model_sparsity": 0.9017165397866533, + "compression_loss": 159.89894104003906, + "distillation_loss": 5.827761650085449, + "epoch": 1.84, + "learning_rate": 3.681922473131265e-05, + "loss": 166.188, + "step": 2183, + "task_loss": 3.3331120014190674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4943702433715673, + "compression/movement_sparsity/importance_threshold": -1.7129162845002516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9338439548095033, + "compression/movement_sparsity/model_sparsity": 0.9017635421217667, + "compression_loss": 159.90733337402344, + "distillation_loss": 8.213016510009766, + "epoch": 1.85, + "learning_rate": 3.6813186813186815e-05, + "loss": 165.9767, + "step": 2184, + "task_loss": 3.0984020233154297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4944615349295043, + "compression/movement_sparsity/importance_threshold": -1.6851398091481673e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9339939608383625, + "compression/movement_sparsity/model_sparsity": 0.901908394982062, + "compression_loss": 159.91578674316406, + "distillation_loss": 6.798361778259277, + "epoch": 1.85, + "learning_rate": 3.680714889506098e-05, + "loss": 165.5089, + "step": 2185, + "task_loss": 2.7990546226501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4945518341977215, + "compression/movement_sparsity/importance_threshold": -1.6576652490177257e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9340489789478343, + "compression/movement_sparsity/model_sparsity": 0.901961523050218, + "compression_loss": 159.9240264892578, + "distillation_loss": 7.7666497230529785, + "epoch": 1.85, + "learning_rate": 3.6801110976935156e-05, + "loss": 166.8684, + "step": 2186, + "task_loss": 3.7062816619873047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4946411465985672, + "compression/movement_sparsity/importance_threshold": -1.630490954298864e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9341234811472232, + "compression/movement_sparsity/model_sparsity": 0.9020334658698623, + "compression_loss": 159.9322509765625, + "distillation_loss": 7.355094909667969, + "epoch": 1.85, + "learning_rate": 3.6795073058809324e-05, + "loss": 166.999, + "step": 2187, + "task_loss": 2.447727918624878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4947294775543898, + "compression/movement_sparsity/importance_threshold": -1.6036152751816926e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9342715554609253, + "compression/movement_sparsity/model_sparsity": 0.9021764533753589, + "compression_loss": 159.9402313232422, + "distillation_loss": 8.737361907958984, + "epoch": 1.85, + "learning_rate": 3.678903514068349e-05, + "loss": 166.4723, + "step": 2188, + "task_loss": 3.3702759742736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.494816832487537, + "compression/movement_sparsity/importance_threshold": -1.5770365618561488e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9343446983052037, + "compression/movement_sparsity/model_sparsity": 0.9022470835379226, + "compression_loss": 159.94825744628906, + "distillation_loss": 9.457321166992188, + "epoch": 1.85, + "learning_rate": 3.6782997222557665e-05, + "loss": 166.8252, + "step": 2189, + "task_loss": 3.891730546951294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4949032168203573, + "compression/movement_sparsity/importance_threshold": -1.55075316451243e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9343520078199645, + "compression/movement_sparsity/model_sparsity": 0.9022541419483647, + "compression_loss": 159.95620727539062, + "distillation_loss": 6.379671573638916, + "epoch": 1.85, + "learning_rate": 3.677695930443183e-05, + "loss": 165.8596, + "step": 2190, + "task_loss": 3.244758367538452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.494988635975199, + "compression/movement_sparsity/importance_threshold": -1.524763433340473e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.934434153410808, + "compression/movement_sparsity/model_sparsity": 0.9023334655854532, + "compression_loss": 159.96400451660156, + "distillation_loss": 6.004927158355713, + "epoch": 1.85, + "learning_rate": 3.6770921386306e-05, + "loss": 165.9915, + "step": 2191, + "task_loss": 2.917937755584717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4950730953744102, + "compression/movement_sparsity/importance_threshold": -1.4990657185303888e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9345362481341063, + "compression/movement_sparsity/model_sparsity": 0.9024320530409261, + "compression_loss": 159.9718475341797, + "distillation_loss": 8.180635452270508, + "epoch": 1.85, + "learning_rate": 3.676488346818017e-05, + "loss": 166.3648, + "step": 2192, + "task_loss": 3.0410654544830322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495156600440339, + "compression/movement_sparsity/importance_threshold": -1.4736583702721143e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9345853160839278, + "compression/movement_sparsity/model_sparsity": 0.9024794353557207, + "compression_loss": 159.9793701171875, + "distillation_loss": 5.616904258728027, + "epoch": 1.85, + "learning_rate": 3.675884555005435e-05, + "loss": 166.4426, + "step": 2193, + "task_loss": 2.4470934867858887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4952391565953338, + "compression/movement_sparsity/importance_threshold": -1.4485397387557601e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9345914212577574, + "compression/movement_sparsity/model_sparsity": 0.9024853307980475, + "compression_loss": 159.98690795898438, + "distillation_loss": 7.2730207443237305, + "epoch": 1.85, + "learning_rate": 3.6752807631928514e-05, + "loss": 166.5278, + "step": 2194, + "task_loss": 2.7994096279144287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4953207692617425, + "compression/movement_sparsity/importance_threshold": -1.4237081741714369e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9347345470418906, + "compression/movement_sparsity/model_sparsity": 0.9026235397711894, + "compression_loss": 159.99436950683594, + "distillation_loss": 6.690430164337158, + "epoch": 1.85, + "learning_rate": 3.674676971380268e-05, + "loss": 166.0103, + "step": 2195, + "task_loss": 3.372398853302002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4954014438619134, + "compression/movement_sparsity/importance_threshold": -1.3991620267091685e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9347729667100134, + "compression/movement_sparsity/model_sparsity": 0.9026606396055193, + "compression_loss": 160.00177001953125, + "distillation_loss": 4.4670562744140625, + "epoch": 1.86, + "learning_rate": 3.6740731795676855e-05, + "loss": 165.965, + "step": 2196, + "task_loss": 2.589193105697632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495481185818195, + "compression/movement_sparsity/importance_threshold": -1.3748996465588921e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349178214984538, + "compression/movement_sparsity/model_sparsity": 0.9028005181863514, + "compression_loss": 160.009033203125, + "distillation_loss": 5.434789657592773, + "epoch": 1.86, + "learning_rate": 3.673469387755102e-05, + "loss": 166.0112, + "step": 2197, + "task_loss": 2.8077433109283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4955600005529348, + "compression/movement_sparsity/importance_threshold": -1.350919383910805e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349239624447864, + "compression/movement_sparsity/model_sparsity": 0.9028064481722856, + "compression_loss": 160.0162811279297, + "distillation_loss": 7.008647918701172, + "epoch": 1.86, + "learning_rate": 3.672865595942519e-05, + "loss": 165.868, + "step": 2198, + "task_loss": 3.6527347564697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4956378934884818, + "compression/movement_sparsity/importance_threshold": -1.3272195889547575e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349464156524446, + "compression/movement_sparsity/model_sparsity": 0.9028281300431867, + "compression_loss": 160.02330017089844, + "distillation_loss": 7.055698871612549, + "epoch": 1.86, + "learning_rate": 3.6722618041299364e-05, + "loss": 166.1398, + "step": 2199, + "task_loss": 3.82051420211792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4957148700471836, + "compression/movement_sparsity/importance_threshold": -1.3037986118810338e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349829632262486, + "compression/movement_sparsity/model_sparsity": 0.902863422095397, + "compression_loss": 160.03028869628906, + "distillation_loss": 6.413025379180908, + "epoch": 1.86, + "learning_rate": 3.671658012317353e-05, + "loss": 167.4812, + "step": 2200, + "task_loss": 3.40012788772583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.495790935651389, + "compression/movement_sparsity/importance_threshold": -1.280654802879571e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9349892949592632, + "compression/movement_sparsity/model_sparsity": 0.902869536313904, + "compression_loss": 160.0372772216797, + "distillation_loss": 6.814481735229492, + "epoch": 1.86, + "learning_rate": 3.67105422050477e-05, + "loss": 166.0952, + "step": 2201, + "task_loss": 2.657379627227783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4958660957234455, + "compression/movement_sparsity/importance_threshold": -1.2577865121403062e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9350578946956724, + "compression/movement_sparsity/model_sparsity": 0.9029357794383299, + "compression_loss": 160.0441131591797, + "distillation_loss": 7.139746189117432, + "epoch": 1.86, + "learning_rate": 3.670450428692187e-05, + "loss": 165.9225, + "step": 2202, + "task_loss": 3.4421777725219727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4959403556857018, + "compression/movement_sparsity/importance_threshold": -1.2351920898535235e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9351021214334337, + "compression/movement_sparsity/model_sparsity": 0.9029784868515919, + "compression_loss": 160.0509796142578, + "distillation_loss": 6.312686920166016, + "epoch": 1.86, + "learning_rate": 3.669846636879604e-05, + "loss": 166.4353, + "step": 2203, + "task_loss": 2.844438076019287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4960137209605058, + "compression/movement_sparsity/importance_threshold": -1.2128698862090732e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9351634235792498, + "compression/movement_sparsity/model_sparsity": 0.9030376830801116, + "compression_loss": 160.05775451660156, + "distillation_loss": 6.673650741577148, + "epoch": 1.86, + "learning_rate": 3.669242845067021e-05, + "loss": 166.7881, + "step": 2204, + "task_loss": 4.141193389892578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4960861969702057, + "compression/movement_sparsity/importance_threshold": -1.1908182513969794e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9352639562365876, + "compression/movement_sparsity/model_sparsity": 0.9031347621313954, + "compression_loss": 160.06434631347656, + "distillation_loss": 7.133143424987793, + "epoch": 1.86, + "learning_rate": 3.668639053254438e-05, + "loss": 165.8784, + "step": 2205, + "task_loss": 3.424313545227051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49615778913715, + "compression/movement_sparsity/importance_threshold": -1.1690355356074392e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9353431804063603, + "compression/movement_sparsity/model_sparsity": 0.9032112647072144, + "compression_loss": 160.07080078125, + "distillation_loss": 6.824629783630371, + "epoch": 1.86, + "learning_rate": 3.668035261441855e-05, + "loss": 166.4237, + "step": 2206, + "task_loss": 3.374077320098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4962285028836866, + "compression/movement_sparsity/importance_threshold": -1.1475200890304765e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9353846526613979, + "compression/movement_sparsity/model_sparsity": 0.9032513122627077, + "compression_loss": 160.07717895507812, + "distillation_loss": 6.282980442047119, + "epoch": 1.87, + "learning_rate": 3.667431469629272e-05, + "loss": 166.5712, + "step": 2207, + "task_loss": 3.4375152587890625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.496298343632164, + "compression/movement_sparsity/importance_threshold": -1.1262702618559418e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9354241455046078, + "compression/movement_sparsity/model_sparsity": 0.9032894484052593, + "compression_loss": 160.08355712890625, + "distillation_loss": 7.188577651977539, + "epoch": 1.87, + "learning_rate": 3.666827677816689e-05, + "loss": 167.0093, + "step": 2208, + "task_loss": 2.6003332138061523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4963673168049298, + "compression/movement_sparsity/importance_threshold": -1.1052844042742059e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9354818227034626, + "compression/movement_sparsity/model_sparsity": 0.9033451442148974, + "compression_loss": 160.0897216796875, + "distillation_loss": 6.781490325927734, + "epoch": 1.87, + "learning_rate": 3.666223886004106e-05, + "loss": 166.8658, + "step": 2209, + "task_loss": 4.086533069610596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4964354278243328, + "compression/movement_sparsity/importance_threshold": -1.0845608664750324e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9355251432044837, + "compression/movement_sparsity/model_sparsity": 0.903386976523439, + "compression_loss": 160.0957489013672, + "distillation_loss": 7.534673690795898, + "epoch": 1.87, + "learning_rate": 3.665620094191523e-05, + "loss": 167.4478, + "step": 2210, + "task_loss": 3.690523624420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4965026821127212, + "compression/movement_sparsity/importance_threshold": -1.0640979986486186e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9356038069383774, + "compression/movement_sparsity/model_sparsity": 0.9034629379160756, + "compression_loss": 160.1017303466797, + "distillation_loss": 6.992488384246826, + "epoch": 1.87, + "learning_rate": 3.66501630237894e-05, + "loss": 166.7302, + "step": 2211, + "task_loss": 2.533961534500122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4965690850924427, + "compression/movement_sparsity/importance_threshold": -1.0438941509849885e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9356397344554643, + "compression/movement_sparsity/model_sparsity": 0.9034976312124245, + "compression_loss": 160.10768127441406, + "distillation_loss": 5.548211574554443, + "epoch": 1.87, + "learning_rate": 3.664412510566357e-05, + "loss": 166.831, + "step": 2212, + "task_loss": 2.105015277862549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4966346421858459, + "compression/movement_sparsity/importance_threshold": -1.023947673674079e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9357597392785516, + "compression/movement_sparsity/model_sparsity": 0.9036135135006608, + "compression_loss": 160.11325073242188, + "distillation_loss": 5.13206672668457, + "epoch": 1.87, + "learning_rate": 3.663808718753774e-05, + "loss": 165.3058, + "step": 2213, + "task_loss": 2.5881645679473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4966993588152788, + "compression/movement_sparsity/importance_threshold": -1.0042569169060878e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9357917318203186, + "compression/movement_sparsity/model_sparsity": 0.9036444070001975, + "compression_loss": 160.1190643310547, + "distillation_loss": 7.204558372497559, + "epoch": 1.87, + "learning_rate": 3.663204926941191e-05, + "loss": 166.0462, + "step": 2214, + "task_loss": 3.9671390056610107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4967632404030895, + "compression/movement_sparsity/importance_threshold": -9.848202308709517e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9359343448652435, + "compression/movement_sparsity/model_sparsity": 0.9037821208483002, + "compression_loss": 160.1246795654297, + "distillation_loss": 7.157886505126953, + "epoch": 1.87, + "learning_rate": 3.662601135128608e-05, + "loss": 166.1892, + "step": 2215, + "task_loss": 2.864903211593628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4968262923716265, + "compression/movement_sparsity/importance_threshold": -9.656359657587814e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9358821289351661, + "compression/movement_sparsity/model_sparsity": 0.903731698696056, + "compression_loss": 160.13018798828125, + "distillation_loss": 8.18702220916748, + "epoch": 1.87, + "learning_rate": 3.6619973433160246e-05, + "loss": 166.8093, + "step": 2216, + "task_loss": 3.9553165435791016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.496888520143238, + "compression/movement_sparsity/importance_threshold": -9.467024717596008e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9359327708751156, + "compression/movement_sparsity/model_sparsity": 0.9037806009295754, + "compression_loss": 160.13563537597656, + "distillation_loss": 6.0620527267456055, + "epoch": 1.87, + "learning_rate": 3.661393551503442e-05, + "loss": 165.3558, + "step": 2217, + "task_loss": 2.769505500793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4969499291402717, + "compression/movement_sparsity/importance_threshold": -9.280180990635205e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9360137002008602, + "compression/movement_sparsity/model_sparsity": 0.903858750084013, + "compression_loss": 160.1409912109375, + "distillation_loss": 5.198545932769775, + "epoch": 1.87, + "learning_rate": 3.660789759690859e-05, + "loss": 165.9041, + "step": 2218, + "task_loss": 2.987011671066284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4970105247850765, + "compression/movement_sparsity/importance_threshold": -9.095811978603908e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9361177385634831, + "compression/movement_sparsity/model_sparsity": 0.9039592144088203, + "compression_loss": 160.14625549316406, + "distillation_loss": 5.245521545410156, + "epoch": 1.88, + "learning_rate": 3.6601859678782755e-05, + "loss": 166.9062, + "step": 2219, + "task_loss": 2.909696578979492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4970703125, + "compression/movement_sparsity/importance_threshold": -8.91390118340496e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9361651252056681, + "compression/movement_sparsity/model_sparsity": 0.9040049731740678, + "compression_loss": 160.15145874023438, + "distillation_loss": 5.957149505615234, + "epoch": 1.88, + "learning_rate": 3.659582176065693e-05, + "loss": 166.0536, + "step": 2220, + "task_loss": 2.8713061809539795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4971292977073907, + "compression/movement_sparsity/importance_threshold": -8.73443210693773e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9361943274922083, + "compression/movement_sparsity/model_sparsity": 0.9040331722722287, + "compression_loss": 160.15650939941406, + "distillation_loss": 6.874434471130371, + "epoch": 1.88, + "learning_rate": 3.6589783842531096e-05, + "loss": 166.494, + "step": 2221, + "task_loss": 2.6459550857543945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4971874858295968, + "compression/movement_sparsity/importance_threshold": -8.557388251102457e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9362388642583283, + "compression/movement_sparsity/model_sparsity": 0.9040761790634212, + "compression_loss": 160.1615753173828, + "distillation_loss": 6.250699996948242, + "epoch": 1.88, + "learning_rate": 3.658374592440526e-05, + "loss": 166.2379, + "step": 2222, + "task_loss": 3.9180636405944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4972448822889661, + "compression/movement_sparsity/importance_threshold": -8.382753117800248e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9363022292851452, + "compression/movement_sparsity/model_sparsity": 0.9041373673066334, + "compression_loss": 160.16648864746094, + "distillation_loss": 5.316680908203125, + "epoch": 1.88, + "learning_rate": 3.657770800627944e-05, + "loss": 166.1566, + "step": 2223, + "task_loss": 3.8898046016693115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4973014925078474, + "compression/movement_sparsity/importance_threshold": -8.210510208930474e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9364030600466741, + "compression/movement_sparsity/model_sparsity": 0.9042347342213121, + "compression_loss": 160.17127990722656, + "distillation_loss": 6.536879539489746, + "epoch": 1.88, + "learning_rate": 3.657167008815361e-05, + "loss": 166.464, + "step": 2224, + "task_loss": 4.587782859802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4973573219085887, + "compression/movement_sparsity/importance_threshold": -8.040643026394241e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9364950788483201, + "compression/movement_sparsity/model_sparsity": 0.9043235918940385, + "compression_loss": 160.176025390625, + "distillation_loss": 7.860706329345703, + "epoch": 1.88, + "learning_rate": 3.656563217002778e-05, + "loss": 167.0165, + "step": 2225, + "task_loss": 3.4239118099212646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4974123759135378, + "compression/movement_sparsity/importance_threshold": -7.873135072092655e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9365924396770668, + "compression/movement_sparsity/model_sparsity": 0.904417608078801, + "compression_loss": 160.18069458007812, + "distillation_loss": 5.045379638671875, + "epoch": 1.88, + "learning_rate": 3.6559594251901945e-05, + "loss": 166.1685, + "step": 2226, + "task_loss": 2.22676682472229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4974666599450435, + "compression/movement_sparsity/importance_threshold": -7.707969847925086e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.936577272135834, + "compression/movement_sparsity/model_sparsity": 0.9044029615892704, + "compression_loss": 160.1852569580078, + "distillation_loss": 5.71860408782959, + "epoch": 1.88, + "learning_rate": 3.655355633377612e-05, + "loss": 165.9178, + "step": 2227, + "task_loss": 2.1781108379364014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4975201794254536, + "compression/movement_sparsity/importance_threshold": -7.5451308557935096e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9367110612967084, + "compression/movement_sparsity/model_sparsity": 0.9045321546808851, + "compression_loss": 160.18972778320312, + "distillation_loss": 6.056445121765137, + "epoch": 1.88, + "learning_rate": 3.6547518415650287e-05, + "loss": 166.8065, + "step": 2228, + "task_loss": 3.6674699783325195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4975729397771163, + "compression/movement_sparsity/importance_threshold": -7.384601597596428e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9367594137964719, + "compression/movement_sparsity/model_sparsity": 0.9045788461235319, + "compression_loss": 160.19427490234375, + "distillation_loss": 4.525057315826416, + "epoch": 1.88, + "learning_rate": 3.6541480497524454e-05, + "loss": 166.3232, + "step": 2229, + "task_loss": 2.363933801651001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4976249464223799, + "compression/movement_sparsity/importance_threshold": -7.226365575235816e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9367685715572162, + "compression/movement_sparsity/model_sparsity": 0.9045876892870222, + "compression_loss": 160.1986083984375, + "distillation_loss": 8.245214462280273, + "epoch": 1.88, + "learning_rate": 3.653544257939863e-05, + "loss": 166.9164, + "step": 2230, + "task_loss": 4.12413215637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4976762047835925, + "compression/movement_sparsity/importance_threshold": -7.0704062906110435e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368000871322778, + "compression/movement_sparsity/model_sparsity": 0.904618122205127, + "compression_loss": 160.20294189453125, + "distillation_loss": 7.933854579925537, + "epoch": 1.89, + "learning_rate": 3.6529404661272795e-05, + "loss": 166.4808, + "step": 2231, + "task_loss": 3.388575315475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4977267202831026, + "compression/movement_sparsity/importance_threshold": -6.916707245623217e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368373859286427, + "compression/movement_sparsity/model_sparsity": 0.9046541396730924, + "compression_loss": 160.20712280273438, + "distillation_loss": 6.536538124084473, + "epoch": 1.89, + "learning_rate": 3.652336674314696e-05, + "loss": 166.948, + "step": 2232, + "task_loss": 2.6000890731811523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4977764983432582, + "compression/movement_sparsity/importance_threshold": -6.765251942172576e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9368213717715078, + "compression/movement_sparsity/model_sparsity": 0.9046386756515203, + "compression_loss": 160.21136474609375, + "distillation_loss": 5.801495552062988, + "epoch": 1.89, + "learning_rate": 3.6517328825021136e-05, + "loss": 166.8858, + "step": 2233, + "task_loss": 2.9945945739746094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4978255443864072, + "compression/movement_sparsity/importance_threshold": -6.6160238821602255e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.936865276556743, + "compression/movement_sparsity/model_sparsity": 0.9046810721723159, + "compression_loss": 160.21539306640625, + "distillation_loss": 5.843520164489746, + "epoch": 1.89, + "learning_rate": 3.651129090689531e-05, + "loss": 166.1308, + "step": 2234, + "task_loss": 2.2337327003479004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4978738638348983, + "compression/movement_sparsity/importance_threshold": -6.469006567485537e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9369365711550378, + "compression/movement_sparsity/model_sparsity": 0.9047499175818314, + "compression_loss": 160.2193145751953, + "distillation_loss": 5.953176498413086, + "epoch": 1.89, + "learning_rate": 3.650525298876947e-05, + "loss": 166.6612, + "step": 2235, + "task_loss": 2.551196813583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4979214621110792, + "compression/movement_sparsity/importance_threshold": -6.324183500049617e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9369228702864242, + "compression/movement_sparsity/model_sparsity": 0.9047366873802035, + "compression_loss": 160.2232666015625, + "distillation_loss": 7.417908668518066, + "epoch": 1.89, + "learning_rate": 3.6499215070643644e-05, + "loss": 166.7264, + "step": 2236, + "task_loss": 3.511472463607788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4979683446372987, + "compression/movement_sparsity/importance_threshold": -6.181538181753571e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.936976517116618, + "compression/movement_sparsity/model_sparsity": 0.9047884912767431, + "compression_loss": 160.22714233398438, + "distillation_loss": 5.96925163269043, + "epoch": 1.89, + "learning_rate": 3.649317715251782e-05, + "loss": 166.5463, + "step": 2237, + "task_loss": 3.3122780323028564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4980145168359045, + "compression/movement_sparsity/importance_threshold": -6.041054114495903e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9370168208032272, + "compression/movement_sparsity/model_sparsity": 0.9048274104077286, + "compression_loss": 160.23094177246094, + "distillation_loss": 6.831334114074707, + "epoch": 1.89, + "learning_rate": 3.648713923439198e-05, + "loss": 167.4659, + "step": 2238, + "task_loss": 3.7148828506469727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4980599841292448, + "compression/movement_sparsity/importance_threshold": -5.902714800179454e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9371547357261036, + "compression/movement_sparsity/model_sparsity": 0.9049605875287283, + "compression_loss": 160.2347412109375, + "distillation_loss": 6.468693256378174, + "epoch": 1.89, + "learning_rate": 3.648110131626615e-05, + "loss": 166.6892, + "step": 2239, + "task_loss": 3.4829370975494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498104751939668, + "compression/movement_sparsity/importance_threshold": -5.7665037407027275e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9372125560149699, + "compression/movement_sparsity/model_sparsity": 0.905016421512796, + "compression_loss": 160.23838806152344, + "distillation_loss": 6.69094181060791, + "epoch": 1.89, + "learning_rate": 3.647506339814033e-05, + "loss": 166.6392, + "step": 2240, + "task_loss": 3.3223018646240234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4981488256895221, + "compression/movement_sparsity/importance_threshold": -5.63240443796683e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.937311610075521, + "compression/movement_sparsity/model_sparsity": 0.9051120727616413, + "compression_loss": 160.24195861816406, + "distillation_loss": 8.05595588684082, + "epoch": 1.89, + "learning_rate": 3.646902548001449e-05, + "loss": 167.0704, + "step": 2241, + "task_loss": 3.416008472442627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4981922108011556, + "compression/movement_sparsity/importance_threshold": -5.500400393873735e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9373872331466676, + "compression/movement_sparsity/model_sparsity": 0.9051850979476503, + "compression_loss": 160.245361328125, + "distillation_loss": 6.689817428588867, + "epoch": 1.89, + "learning_rate": 3.646298756188866e-05, + "loss": 166.2641, + "step": 2242, + "task_loss": 3.636019229888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4982349126969166, + "compression/movement_sparsity/importance_threshold": -5.370475110321078e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9374037123463405, + "compression/movement_sparsity/model_sparsity": 0.9052010110361184, + "compression_loss": 160.24884033203125, + "distillation_loss": 7.328866004943848, + "epoch": 1.9, + "learning_rate": 3.6456949643762835e-05, + "loss": 166.4979, + "step": 2243, + "task_loss": 3.576991081237793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498276936799153, + "compression/movement_sparsity/importance_threshold": -5.242612089210834e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9374651456580003, + "compression/movement_sparsity/model_sparsity": 0.9052603339245319, + "compression_loss": 160.2522735595703, + "distillation_loss": 6.623807907104492, + "epoch": 1.9, + "learning_rate": 3.6450911725637e-05, + "loss": 167.2635, + "step": 2244, + "task_loss": 2.831836462020874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4983182885302133, + "compression/movement_sparsity/importance_threshold": -5.116794832444108e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9375227155393463, + "compression/movement_sparsity/model_sparsity": 0.9053159261033479, + "compression_loss": 160.25572204589844, + "distillation_loss": 6.56038761138916, + "epoch": 1.9, + "learning_rate": 3.644487380751117e-05, + "loss": 166.1317, + "step": 2245, + "task_loss": 2.967102527618408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4983589733124458, + "compression/movement_sparsity/importance_threshold": -4.993006841920271e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9375944990285142, + "compression/movement_sparsity/model_sparsity": 0.905385243608831, + "compression_loss": 160.2588348388672, + "distillation_loss": 7.125331878662109, + "epoch": 1.9, + "learning_rate": 3.643883588938534e-05, + "loss": 167.2203, + "step": 2246, + "task_loss": 4.350142002105713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498398996568198, + "compression/movement_sparsity/importance_threshold": -4.8712316195404295e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9376362932360779, + "compression/movement_sparsity/model_sparsity": 0.9054256020567909, + "compression_loss": 160.26210021972656, + "distillation_loss": 4.466840744018555, + "epoch": 1.9, + "learning_rate": 3.643279797125951e-05, + "loss": 165.5341, + "step": 2247, + "task_loss": 2.2815232276916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498438363719819, + "compression/movement_sparsity/importance_threshold": -4.751452667204822e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9377269288342781, + "compression/movement_sparsity/model_sparsity": 0.9055131240433651, + "compression_loss": 160.26519775390625, + "distillation_loss": 5.584445953369141, + "epoch": 1.9, + "learning_rate": 3.642676005313368e-05, + "loss": 166.7658, + "step": 2248, + "task_loss": 3.8877828121185303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4984770801896568, + "compression/movement_sparsity/importance_threshold": -4.633653486813688e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9377823523654496, + "compression/movement_sparsity/model_sparsity": 0.9055666436057382, + "compression_loss": 160.26832580566406, + "distillation_loss": 8.422534942626953, + "epoch": 1.9, + "learning_rate": 3.642072213500785e-05, + "loss": 167.4358, + "step": 2249, + "task_loss": 3.821789026260376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498515151400059, + "compression/movement_sparsity/importance_threshold": -4.5178175802681325e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378435948904273, + "compression/movement_sparsity/model_sparsity": 0.905625782261579, + "compression_loss": 160.2713165283203, + "distillation_loss": 7.25454568862915, + "epoch": 1.9, + "learning_rate": 3.6414684216882026e-05, + "loss": 166.5585, + "step": 2250, + "task_loss": 4.325481414794922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4985525827733743, + "compression/movement_sparsity/importance_threshold": -4.403928449467527e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378312414527566, + "compression/movement_sparsity/model_sparsity": 0.9056138532024958, + "compression_loss": 160.27426147460938, + "distillation_loss": 6.13362979888916, + "epoch": 1.9, + "learning_rate": 3.6408646298756186e-05, + "loss": 166.109, + "step": 2251, + "task_loss": 2.4424173831939697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4985893797319507, + "compression/movement_sparsity/importance_threshold": -4.291969596313845e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378494138842337, + "compression/movement_sparsity/model_sparsity": 0.9056314013550467, + "compression_loss": 160.2769775390625, + "distillation_loss": 6.943127155303955, + "epoch": 1.9, + "learning_rate": 3.640260838063036e-05, + "loss": 166.8966, + "step": 2252, + "task_loss": 2.503864288330078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4986255476981365, + "compression/movement_sparsity/importance_threshold": -4.181924522705591e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9378513336752231, + "compression/movement_sparsity/model_sparsity": 0.9056332551953096, + "compression_loss": 160.27963256835938, + "distillation_loss": 6.693708419799805, + "epoch": 1.9, + "learning_rate": 3.6396570462504534e-05, + "loss": 166.0906, + "step": 2253, + "task_loss": 3.5935592651367188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49866109209428, + "compression/movement_sparsity/importance_threshold": -4.073776730544737e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9379069122205739, + "compression/movement_sparsity/model_sparsity": 0.905686924446648, + "compression_loss": 160.28225708007812, + "distillation_loss": 6.474967956542969, + "epoch": 1.9, + "learning_rate": 3.63905325443787e-05, + "loss": 165.6588, + "step": 2254, + "task_loss": 2.2397470474243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4986960183427291, + "compression/movement_sparsity/importance_threshold": -3.967509721731523e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9379576495538644, + "compression/movement_sparsity/model_sparsity": 0.9057359187964538, + "compression_loss": 160.28485107421875, + "distillation_loss": 6.957125663757324, + "epoch": 1.91, + "learning_rate": 3.638449462625287e-05, + "loss": 166.6282, + "step": 2255, + "task_loss": 3.5468225479125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987303318658323, + "compression/movement_sparsity/importance_threshold": -3.863106998166187e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9380159587336038, + "compression/movement_sparsity/model_sparsity": 0.905792224876489, + "compression_loss": 160.2872314453125, + "distillation_loss": 6.309211730957031, + "epoch": 1.91, + "learning_rate": 3.637845670812704e-05, + "loss": 165.927, + "step": 2256, + "task_loss": 3.3922078609466553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987640380859375, + "compression/movement_sparsity/importance_threshold": -3.7605520617489674e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9379826545333968, + "compression/movement_sparsity/model_sparsity": 0.9057600647780149, + "compression_loss": 160.2897491455078, + "distillation_loss": 6.718111991882324, + "epoch": 1.91, + "learning_rate": 3.637241879000121e-05, + "loss": 167.1852, + "step": 2257, + "task_loss": 2.5863261222839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4987971424253932, + "compression/movement_sparsity/importance_threshold": -3.6598284143801035e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9380228747508326, + "compression/movement_sparsity/model_sparsity": 0.9057989033072499, + "compression_loss": 160.29217529296875, + "distillation_loss": 6.152707576751709, + "epoch": 1.91, + "learning_rate": 3.6366380871875377e-05, + "loss": 166.3755, + "step": 2258, + "task_loss": 2.6125926971435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4988296503065472, + "compression/movement_sparsity/importance_threshold": -3.5609195579615685e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9380781671161601, + "compression/movement_sparsity/model_sparsity": 0.9058522962097292, + "compression_loss": 160.29444885253906, + "distillation_loss": 5.998579978942871, + "epoch": 1.91, + "learning_rate": 3.636034295374955e-05, + "loss": 166.0301, + "step": 2259, + "task_loss": 2.7233455181121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4988615671517482, + "compression/movement_sparsity/importance_threshold": -3.4638089943918662e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9381247548391134, + "compression/movement_sparsity/model_sparsity": 0.9058972835010785, + "compression_loss": 160.2967071533203, + "distillation_loss": 5.3853936195373535, + "epoch": 1.91, + "learning_rate": 3.635430503562372e-05, + "loss": 166.3539, + "step": 2260, + "task_loss": 3.242863893508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498892898383344, + "compression/movement_sparsity/importance_threshold": -3.36848022557297e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9381330302114527, + "compression/movement_sparsity/model_sparsity": 0.9059052745889199, + "compression_loss": 160.29898071289062, + "distillation_loss": 5.459070205688477, + "epoch": 1.91, + "learning_rate": 3.6348267117497885e-05, + "loss": 165.9188, + "step": 2261, + "task_loss": 3.122023582458496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.498923649423683, + "compression/movement_sparsity/importance_threshold": -3.2749167534051188e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9381983984984325, + "compression/movement_sparsity/model_sparsity": 0.9059683972741455, + "compression_loss": 160.30105590820312, + "distillation_loss": 6.814894676208496, + "epoch": 1.91, + "learning_rate": 3.634222919937206e-05, + "loss": 166.7034, + "step": 2262, + "task_loss": 3.489725351333618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4989538256951132, + "compression/movement_sparsity/importance_threshold": -3.1831020797876836e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9382597125684161, + "compression/movement_sparsity/model_sparsity": 0.906027605017201, + "compression_loss": 160.30328369140625, + "distillation_loss": 6.812126159667969, + "epoch": 1.91, + "learning_rate": 3.6336191281246226e-05, + "loss": 166.5372, + "step": 2263, + "task_loss": 3.029024362564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4989834326199831, + "compression/movement_sparsity/importance_threshold": -3.0930197066217704e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9382236896579882, + "compression/movement_sparsity/model_sparsity": 0.9059928196045658, + "compression_loss": 160.3053741455078, + "distillation_loss": 8.494635581970215, + "epoch": 1.91, + "learning_rate": 3.63301533631204e-05, + "loss": 166.7912, + "step": 2264, + "task_loss": 4.287404537200928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990124756206404, + "compression/movement_sparsity/importance_threshold": -3.0046531358084855e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9382585559241554, + "compression/movement_sparsity/model_sparsity": 0.906026488107229, + "compression_loss": 160.30738830566406, + "distillation_loss": 5.719357490539551, + "epoch": 1.91, + "learning_rate": 3.632411544499457e-05, + "loss": 166.7411, + "step": 2265, + "task_loss": 3.0443618297576904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499040960119434, + "compression/movement_sparsity/importance_threshold": -2.9179858692472e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9382844671404281, + "compression/movement_sparsity/model_sparsity": 0.9060515091935105, + "compression_loss": 160.309326171875, + "distillation_loss": 5.495291709899902, + "epoch": 1.91, + "learning_rate": 3.631807752686874e-05, + "loss": 165.9636, + "step": 2266, + "task_loss": 3.589195966720581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990688915387116, + "compression/movement_sparsity/importance_threshold": -2.83300140883902e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383269887222176, + "compression/movement_sparsity/model_sparsity": 0.9060925700281538, + "compression_loss": 160.31121826171875, + "distillation_loss": 4.694790840148926, + "epoch": 1.92, + "learning_rate": 3.631203960874291e-05, + "loss": 165.6907, + "step": 2267, + "task_loss": 1.6176011562347412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4990962753008215, + "compression/movement_sparsity/importance_threshold": -2.749683256484184e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383026753444081, + "compression/movement_sparsity/model_sparsity": 0.9060690918896688, + "compression_loss": 160.3130340576172, + "distillation_loss": 6.856955528259277, + "epoch": 1.92, + "learning_rate": 3.6306001690617076e-05, + "loss": 167.0673, + "step": 2268, + "task_loss": 2.5767476558685303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991231168281116, + "compression/movement_sparsity/importance_threshold": -2.668014914083798e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9383620219267318, + "compression/movement_sparsity/model_sparsity": 0.9061263997343182, + "compression_loss": 160.31483459472656, + "distillation_loss": 6.369436264038086, + "epoch": 1.92, + "learning_rate": 3.629996377249125e-05, + "loss": 166.8442, + "step": 2269, + "task_loss": 2.9285056591033936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991494215429304, + "compression/movement_sparsity/importance_threshold": -2.5879798835372336e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9384354390268658, + "compression/movement_sparsity/model_sparsity": 0.9061972947312052, + "compression_loss": 160.31654357910156, + "distillation_loss": 5.047272682189941, + "epoch": 1.92, + "learning_rate": 3.629392585436542e-05, + "loss": 165.8542, + "step": 2270, + "task_loss": 2.8153228759765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4991751948676264, + "compression/movement_sparsity/importance_threshold": -2.5095616667455967e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9384625307357345, + "compression/movement_sparsity/model_sparsity": 0.9062234557565304, + "compression_loss": 160.31805419921875, + "distillation_loss": 6.451539993286133, + "epoch": 1.92, + "learning_rate": 3.6287887936239584e-05, + "loss": 165.6697, + "step": 2271, + "task_loss": 2.927915334701538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992004422245473, + "compression/movement_sparsity/importance_threshold": -2.432743765609126e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9385876271784024, + "compression/movement_sparsity/model_sparsity": 0.9063442547515509, + "compression_loss": 160.31951904296875, + "distillation_loss": 7.025057792663574, + "epoch": 1.92, + "learning_rate": 3.628185001811376e-05, + "loss": 166.8978, + "step": 2272, + "task_loss": 2.3560400009155273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992251690360416, + "compression/movement_sparsity/importance_threshold": -2.3575096820280597e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9386287536325785, + "compression/movement_sparsity/model_sparsity": 0.9063839683855063, + "compression_loss": 160.3209686279297, + "distillation_loss": 7.400439262390137, + "epoch": 1.92, + "learning_rate": 3.6275812099987925e-05, + "loss": 166.7368, + "step": 2273, + "task_loss": 3.0399365425109863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992493807244573, + "compression/movement_sparsity/importance_threshold": -2.2838429179035044e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.938641047449411, + "compression/movement_sparsity/model_sparsity": 0.9063958398719104, + "compression_loss": 160.3223419189453, + "distillation_loss": 8.527482032775879, + "epoch": 1.92, + "learning_rate": 3.62697741818621e-05, + "loss": 167.3861, + "step": 2274, + "task_loss": 4.076850891113281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992730827121428, + "compression/movement_sparsity/importance_threshold": -2.2117269751356985e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9386525661953472, + "compression/movement_sparsity/model_sparsity": 0.906406962913488, + "compression_loss": 160.3237762451172, + "distillation_loss": 6.8045549392700195, + "epoch": 1.92, + "learning_rate": 3.6263736263736266e-05, + "loss": 166.9141, + "step": 2275, + "task_loss": 3.4388017654418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4992962804214458, + "compression/movement_sparsity/importance_threshold": -2.1411453556248808e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9387365600321743, + "compression/movement_sparsity/model_sparsity": 0.9064880713036247, + "compression_loss": 160.3250732421875, + "distillation_loss": 5.576113700866699, + "epoch": 1.92, + "learning_rate": 3.625769834561043e-05, + "loss": 166.3438, + "step": 2276, + "task_loss": 2.7813143730163574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993189792747152, + "compression/movement_sparsity/importance_threshold": -2.072081561272157e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388340401025974, + "compression/movement_sparsity/model_sparsity": 0.9065822026337451, + "compression_loss": 160.3262176513672, + "distillation_loss": 6.722973823547363, + "epoch": 1.92, + "learning_rate": 3.625166042748461e-05, + "loss": 166.9828, + "step": 2277, + "task_loss": 2.5521368980407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993411846942988, + "compression/movement_sparsity/importance_threshold": -2.004519093976899e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388386428313049, + "compression/movement_sparsity/model_sparsity": 0.9065866472445618, + "compression_loss": 160.32742309570312, + "distillation_loss": 6.263413429260254, + "epoch": 1.93, + "learning_rate": 3.6245622509358774e-05, + "loss": 166.5644, + "step": 2278, + "task_loss": 2.8139054775238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993629021025447, + "compression/movement_sparsity/importance_threshold": -1.938441455640212e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9388515686290222, + "compression/movement_sparsity/model_sparsity": 0.9065991290013631, + "compression_loss": 160.32847595214844, + "distillation_loss": 7.207777976989746, + "epoch": 1.93, + "learning_rate": 3.623958459123294e-05, + "loss": 166.597, + "step": 2279, + "task_loss": 3.802414655685425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4993841369218013, + "compression/movement_sparsity/importance_threshold": -1.8738321481632028e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9389405467679209, + "compression/movement_sparsity/model_sparsity": 0.906685050467462, + "compression_loss": 160.3295440673828, + "distillation_loss": 6.671743392944336, + "epoch": 1.93, + "learning_rate": 3.6233546673107116e-05, + "loss": 167.1642, + "step": 2280, + "task_loss": 2.805103063583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994048945744167, + "compression/movement_sparsity/importance_threshold": -1.8106746734452422e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9389457337808426, + "compression/movement_sparsity/model_sparsity": 0.9066900592905326, + "compression_loss": 160.33045959472656, + "distillation_loss": 5.942516803741455, + "epoch": 1.93, + "learning_rate": 3.622750875498128e-05, + "loss": 166.639, + "step": 2281, + "task_loss": 2.929370403289795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994251804827392, + "compression/movement_sparsity/importance_threshold": -1.748952533386569e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390214760936656, + "compression/movement_sparsity/model_sparsity": 0.9067631996218994, + "compression_loss": 160.3313751220703, + "distillation_loss": 6.769024848937988, + "epoch": 1.93, + "learning_rate": 3.622147083685546e-05, + "loss": 166.5939, + "step": 2282, + "task_loss": 3.039930820465088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994450000691169, + "compression/movement_sparsity/importance_threshold": -1.6886492298882894e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9390886687782937, + "compression/movement_sparsity/model_sparsity": 0.9068280840311017, + "compression_loss": 160.33216857910156, + "distillation_loss": 6.010128021240234, + "epoch": 1.93, + "learning_rate": 3.6215432918729624e-05, + "loss": 166.4873, + "step": 2283, + "task_loss": 3.6313719749450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499464358755898, + "compression/movement_sparsity/importance_threshold": -1.629748264851509e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9391763352587525, + "compression/movement_sparsity/model_sparsity": 0.9069127388982632, + "compression_loss": 160.33290100097656, + "distillation_loss": 6.367352485656738, + "epoch": 1.93, + "learning_rate": 3.62093950006038e-05, + "loss": 166.7826, + "step": 2284, + "task_loss": 2.403203248977661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4994832619654308, + "compression/movement_sparsity/importance_threshold": -1.5722331401755996e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9391981564855262, + "compression/movement_sparsity/model_sparsity": 0.9069338104987672, + "compression_loss": 160.33380126953125, + "distillation_loss": 6.181642532348633, + "epoch": 1.93, + "learning_rate": 3.6203357082477965e-05, + "loss": 166.2573, + "step": 2285, + "task_loss": 3.247931957244873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995017151200631, + "compression/movement_sparsity/importance_threshold": -1.5160873577616668e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.939294110262492, + "compression/movement_sparsity/model_sparsity": 0.9070264679683059, + "compression_loss": 160.33441162109375, + "distillation_loss": 5.73525333404541, + "epoch": 1.93, + "learning_rate": 3.619731916435213e-05, + "loss": 166.7404, + "step": 2286, + "task_loss": 2.8805460929870605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995197236421438, + "compression/movement_sparsity/importance_threshold": -1.4612944195090821e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9392826630615615, + "compression/movement_sparsity/model_sparsity": 0.9070154140139431, + "compression_loss": 160.3350372314453, + "distillation_loss": 6.634737014770508, + "epoch": 1.93, + "learning_rate": 3.6191281246226306e-05, + "loss": 166.7164, + "step": 2287, + "task_loss": 3.3304269313812256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995372929540205, + "compression/movement_sparsity/importance_threshold": -1.4078378273198189e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.939242860189993, + "compression/movement_sparsity/model_sparsity": 0.9069769784934609, + "compression_loss": 160.33563232421875, + "distillation_loss": 6.07640266418457, + "epoch": 1.93, + "learning_rate": 3.6185243328100473e-05, + "loss": 166.5357, + "step": 2288, + "task_loss": 2.351408004760742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995544284780418, + "compression/movement_sparsity/importance_threshold": -1.3557010830941157e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.939326985192664, + "compression/movement_sparsity/model_sparsity": 0.9070582135434915, + "compression_loss": 160.33609008789062, + "distillation_loss": 5.704773902893066, + "epoch": 1.93, + "learning_rate": 3.617920540997464e-05, + "loss": 166.453, + "step": 2289, + "task_loss": 2.63021183013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995711356365553, + "compression/movement_sparsity/importance_threshold": -1.304867688731344e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9393338773615576, + "compression/movement_sparsity/model_sparsity": 0.9070648689451807, + "compression_loss": 160.336669921875, + "distillation_loss": 7.515334129333496, + "epoch": 1.94, + "learning_rate": 3.6173167491848815e-05, + "loss": 166.993, + "step": 2290, + "task_loss": 2.848987340927124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4995874198519097, + "compression/movement_sparsity/importance_threshold": -1.2553211461326097e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9393913876220653, + "compression/movement_sparsity/model_sparsity": 0.9071204035513177, + "compression_loss": 160.33712768554688, + "distillation_loss": 6.515684127807617, + "epoch": 1.94, + "learning_rate": 3.616712957372298e-05, + "loss": 166.7115, + "step": 2291, + "task_loss": 4.2849321365356445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996032865464533, + "compression/movement_sparsity/importance_threshold": -1.2070449571981515e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9394231059079767, + "compression/movement_sparsity/model_sparsity": 0.9071510322165313, + "compression_loss": 160.3374481201172, + "distillation_loss": 6.23954963684082, + "epoch": 1.94, + "learning_rate": 3.616109165559715e-05, + "loss": 166.5163, + "step": 2292, + "task_loss": 3.813804864883423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499618741142534, + "compression/movement_sparsity/importance_threshold": -1.1600226238299427e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9395280624315075, + "compression/movement_sparsity/model_sparsity": 0.9072523831605948, + "compression_loss": 160.3378448486328, + "distillation_loss": 7.553248405456543, + "epoch": 1.94, + "learning_rate": 3.615505373747132e-05, + "loss": 166.6389, + "step": 2293, + "task_loss": 3.37321138381958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996337890625, + "compression/movement_sparsity/importance_threshold": -1.11423764792562e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9395923575434002, + "compression/movement_sparsity/model_sparsity": 0.907314469537599, + "compression_loss": 160.33815002441406, + "distillation_loss": 6.544445037841797, + "epoch": 1.94, + "learning_rate": 3.614901581934549e-05, + "loss": 166.6987, + "step": 2294, + "task_loss": 3.0458126068115234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996484357286994, + "compression/movement_sparsity/importance_threshold": -1.069673531388024e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9396370254753641, + "compression/movement_sparsity/model_sparsity": 0.9073576029886853, + "compression_loss": 160.33839416503906, + "distillation_loss": 5.859119415283203, + "epoch": 1.94, + "learning_rate": 3.614297790121966e-05, + "loss": 166.3688, + "step": 2295, + "task_loss": 2.3848659992218018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499662686563481, + "compression/movement_sparsity/importance_threshold": -1.0263137761165261e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.939689718372147, + "compression/movement_sparsity/model_sparsity": 0.9074084857223613, + "compression_loss": 160.33863830566406, + "distillation_loss": 5.787991046905518, + "epoch": 1.94, + "learning_rate": 3.613693998309383e-05, + "loss": 166.0734, + "step": 2296, + "task_loss": 4.380331516265869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996765469891922, + "compression/movement_sparsity/importance_threshold": -9.84141884011365e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9397602975203837, + "compression/movement_sparsity/model_sparsity": 0.9074766402597292, + "compression_loss": 160.33872985839844, + "distillation_loss": 6.22559928894043, + "epoch": 1.94, + "learning_rate": 3.6130902064968005e-05, + "loss": 166.1445, + "step": 2297, + "task_loss": 3.5651767253875732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4996900224281817, + "compression/movement_sparsity/importance_threshold": -9.431413569736466e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9398338815588646, + "compression/movement_sparsity/model_sparsity": 0.9075476964601173, + "compression_loss": 160.33876037597656, + "distillation_loss": 8.017066955566406, + "epoch": 1.94, + "learning_rate": 3.6124864146842166e-05, + "loss": 167.2958, + "step": 2298, + "task_loss": 4.0405192375183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997031183027976, + "compression/movement_sparsity/importance_threshold": -9.03295696904477e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9398198587377248, + "compression/movement_sparsity/model_sparsity": 0.907534155366023, + "compression_loss": 160.33883666992188, + "distillation_loss": 6.559440612792969, + "epoch": 1.94, + "learning_rate": 3.611882622871634e-05, + "loss": 166.792, + "step": 2299, + "task_loss": 3.385913610458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997158400353878, + "compression/movement_sparsity/importance_threshold": -8.645884057023601e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9398600670309929, + "compression/movement_sparsity/model_sparsity": 0.9075729823807221, + "compression_loss": 160.33889770507812, + "distillation_loss": 7.03973388671875, + "epoch": 1.94, + "learning_rate": 3.6112788310590513e-05, + "loss": 167.3513, + "step": 2300, + "task_loss": 4.346315383911133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499728193048301, + "compression/movement_sparsity/importance_threshold": -8.270029852701366e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399225258210696, + "compression/movement_sparsity/model_sparsity": 0.907633295519214, + "compression_loss": 160.33889770507812, + "distillation_loss": 8.175253868103027, + "epoch": 1.94, + "learning_rate": 3.610675039246468e-05, + "loss": 167.1721, + "step": 2301, + "task_loss": 3.252653121948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499740182763885, + "compression/movement_sparsity/importance_threshold": -7.905229375063105e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399236466928274, + "compression/movement_sparsity/model_sparsity": 0.9076343778855787, + "compression_loss": 160.33888244628906, + "distillation_loss": 6.105640411376953, + "epoch": 1.95, + "learning_rate": 3.610071247433885e-05, + "loss": 166.3246, + "step": 2302, + "task_loss": 2.7947325706481934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997518146044881, + "compression/movement_sparsity/importance_threshold": -7.551317643119879e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399670387388542, + "compression/movement_sparsity/model_sparsity": 0.907676279281335, + "compression_loss": 160.3389129638672, + "distillation_loss": 6.309852600097656, + "epoch": 1.95, + "learning_rate": 3.609467455621302e-05, + "loss": 166.3784, + "step": 2303, + "task_loss": 3.589118480682373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997630939924584, + "compression/movement_sparsity/importance_threshold": -7.208129675874073e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9399956209686774, + "compression/movement_sparsity/model_sparsity": 0.9077038796236345, + "compression_loss": 160.33880615234375, + "distillation_loss": 4.6221184730529785, + "epoch": 1.95, + "learning_rate": 3.608863663808719e-05, + "loss": 166.0841, + "step": 2304, + "task_loss": 3.0640878677368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997740263501445, + "compression/movement_sparsity/importance_threshold": -6.875500492345421e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9400290205622255, + "compression/movement_sparsity/model_sparsity": 0.907736131838395, + "compression_loss": 160.3385467529297, + "distillation_loss": 4.470045566558838, + "epoch": 1.95, + "learning_rate": 3.6082598719961356e-05, + "loss": 165.3681, + "step": 2305, + "task_loss": 2.0147652626037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499784617099894, + "compression/movement_sparsity/importance_threshold": -6.55326511151029e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9400840386716973, + "compression/movement_sparsity/model_sparsity": 0.907789259906551, + "compression_loss": 160.33851623535156, + "distillation_loss": 9.148815155029297, + "epoch": 1.95, + "learning_rate": 3.607656080183553e-05, + "loss": 167.3632, + "step": 2306, + "task_loss": 3.2318601608276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4997948716640557, + "compression/movement_sparsity/importance_threshold": -6.241258552397086e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9401128593848732, + "compression/movement_sparsity/model_sparsity": 0.9078170905395664, + "compression_loss": 160.3382110595703, + "distillation_loss": 5.4949727058410645, + "epoch": 1.95, + "learning_rate": 3.6070522883709704e-05, + "loss": 166.1826, + "step": 2307, + "task_loss": 2.8028347492218018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998047954649776, + "compression/movement_sparsity/importance_threshold": -5.939315834008196e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9401092463620796, + "compression/movement_sparsity/model_sparsity": 0.9078136016352207, + "compression_loss": 160.33786010742188, + "distillation_loss": 7.650700092315674, + "epoch": 1.95, + "learning_rate": 3.6064484965583864e-05, + "loss": 166.4169, + "step": 2308, + "task_loss": 3.234161376953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998143939250075, + "compression/movement_sparsity/importance_threshold": -5.647271975337334e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9401593040178149, + "compression/movement_sparsity/model_sparsity": 0.9078619396564862, + "compression_loss": 160.33767700195312, + "distillation_loss": 6.528079986572266, + "epoch": 1.95, + "learning_rate": 3.605844704745804e-05, + "loss": 165.8884, + "step": 2309, + "task_loss": 2.630993366241455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499823672466494, + "compression/movement_sparsity/importance_threshold": -5.364961995386885e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.940188410911014, + "compression/movement_sparsity/model_sparsity": 0.9078900466383606, + "compression_loss": 160.33731079101562, + "distillation_loss": 5.694217205047607, + "epoch": 1.95, + "learning_rate": 3.605240912933221e-05, + "loss": 165.9421, + "step": 2310, + "task_loss": 2.8399152755737305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499832636511785, + "compression/movement_sparsity/importance_threshold": -5.092220913185258e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9402290246259818, + "compression/movement_sparsity/model_sparsity": 0.9079292651472769, + "compression_loss": 160.33702087402344, + "distillation_loss": 6.948454856872559, + "epoch": 1.95, + "learning_rate": 3.604637121120637e-05, + "loss": 166.7524, + "step": 2311, + "task_loss": 2.875445604324341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499841291483229, + "compression/movement_sparsity/importance_threshold": -4.828883747708818e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9402881684974557, + "compression/movement_sparsity/model_sparsity": 0.9079863772448178, + "compression_loss": 160.3366241455078, + "distillation_loss": 7.841369152069092, + "epoch": 1.95, + "learning_rate": 3.604033329308055e-05, + "loss": 166.2384, + "step": 2312, + "task_loss": 3.943207263946533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499849642803174, + "compression/movement_sparsity/importance_threshold": -4.574785517977298e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9403017620485605, + "compression/movement_sparsity/model_sparsity": 0.9079995038156236, + "compression_loss": 160.33612060546875, + "distillation_loss": 6.916234970092773, + "epoch": 1.95, + "learning_rate": 3.603429537495472e-05, + "loss": 167.5723, + "step": 2313, + "task_loss": 4.204134464263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998576958939684, + "compression/movement_sparsity/importance_threshold": -4.3297612429930854e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.940318384338245, + "compression/movement_sparsity/model_sparsity": 0.9080155550785212, + "compression_loss": 160.33570861816406, + "distillation_loss": 6.801494598388672, + "epoch": 1.96, + "learning_rate": 3.602825745682888e-05, + "loss": 166.8725, + "step": 2314, + "task_loss": 3.3035645484924316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998654561779603, + "compression/movement_sparsity/importance_threshold": -4.093645941758567e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9403815466542121, + "compression/movement_sparsity/model_sparsity": 0.9080765475746249, + "compression_loss": 160.33543395996094, + "distillation_loss": 8.283699035644531, + "epoch": 1.96, + "learning_rate": 3.6022219538703055e-05, + "loss": 166.9866, + "step": 2315, + "task_loss": 3.5167155265808105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499872929077498, + "compression/movement_sparsity/importance_threshold": -3.866274633276129e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9404351338635677, + "compression/movement_sparsity/model_sparsity": 0.9081282938984855, + "compression_loss": 160.33493041992188, + "distillation_loss": 6.499887466430664, + "epoch": 1.96, + "learning_rate": 3.601618162057723e-05, + "loss": 166.494, + "step": 2316, + "task_loss": 3.560060739517212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998801200149292, + "compression/movement_sparsity/importance_threshold": -3.6474823365568315e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9405057130118043, + "compression/movement_sparsity/model_sparsity": 0.9081964484358533, + "compression_loss": 160.33448791503906, + "distillation_loss": 5.3695502281188965, + "epoch": 1.96, + "learning_rate": 3.6010143702451396e-05, + "loss": 166.0111, + "step": 2317, + "task_loss": 3.145541191101074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998870344126027, + "compression/movement_sparsity/importance_threshold": -3.4371040706030614e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.94054416845243, + "compression/movement_sparsity/model_sparsity": 0.9082335828137907, + "compression_loss": 160.33404541015625, + "distillation_loss": 5.96377420425415, + "epoch": 1.96, + "learning_rate": 3.6004105784325563e-05, + "loss": 166.1986, + "step": 2318, + "task_loss": 2.40494704246521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4998936776928664, + "compression/movement_sparsity/importance_threshold": -3.234974854417205e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9405915670187825, + "compression/movement_sparsity/model_sparsity": 0.9082793530935741, + "compression_loss": 160.33348083496094, + "distillation_loss": 6.279591083526611, + "epoch": 1.96, + "learning_rate": 3.599806786619974e-05, + "loss": 166.3923, + "step": 2319, + "task_loss": 2.278113842010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999000552780686, + "compression/movement_sparsity/importance_threshold": -3.040929707010323e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9406146522073257, + "compression/movement_sparsity/model_sparsity": 0.9083016452348723, + "compression_loss": 160.3328857421875, + "distillation_loss": 7.564520835876465, + "epoch": 1.96, + "learning_rate": 3.5992029948073905e-05, + "loss": 167.1486, + "step": 2320, + "task_loss": 4.302159309387207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999061725905571, + "compression/movement_sparsity/importance_threshold": -2.854803647376128e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9406433059821546, + "compression/movement_sparsity/model_sparsity": 0.9083293146643866, + "compression_loss": 160.3323516845703, + "distillation_loss": 5.496919631958008, + "epoch": 1.96, + "learning_rate": 3.598599202994807e-05, + "loss": 166.971, + "step": 2321, + "task_loss": 3.1690382957458496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499912035052681, + "compression/movement_sparsity/importance_threshold": -2.6764316945343536e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9407071956723475, + "compression/movement_sparsity/model_sparsity": 0.9083910095471738, + "compression_loss": 160.33177185058594, + "distillation_loss": 5.889617919921875, + "epoch": 1.96, + "learning_rate": 3.5979954111822246e-05, + "loss": 166.5717, + "step": 2322, + "task_loss": 3.0783376693725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999176480867873, + "compression/movement_sparsity/importance_threshold": -2.5056488674700395e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9407848577761599, + "compression/movement_sparsity/model_sparsity": 0.9084660037188036, + "compression_loss": 160.3311004638672, + "distillation_loss": 6.857034683227539, + "epoch": 1.96, + "learning_rate": 3.597391619369642e-05, + "loss": 166.5412, + "step": 2323, + "task_loss": 2.7734100818634033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999230171152251, + "compression/movement_sparsity/importance_threshold": -2.3422901852029193e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9408720472899134, + "compression/movement_sparsity/model_sparsity": 0.9085501980045333, + "compression_loss": 160.3304901123047, + "distillation_loss": 6.121474266052246, + "epoch": 1.96, + "learning_rate": 3.596787827557058e-05, + "loss": 166.3995, + "step": 2324, + "task_loss": 2.63283634185791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999281475603423, + "compression/movement_sparsity/importance_threshold": -2.1861906667353798e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9408993059371289, + "compression/movement_sparsity/model_sparsity": 0.9085765202333596, + "compression_loss": 160.3297576904297, + "distillation_loss": 6.814990043640137, + "epoch": 1.96, + "learning_rate": 3.5961840357444754e-05, + "loss": 167.1171, + "step": 2325, + "task_loss": 3.5269968509674072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999330448444872, + "compression/movement_sparsity/importance_threshold": -2.0371853310611338e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9409677029626883, + "compression/movement_sparsity/model_sparsity": 0.9086425676106771, + "compression_loss": 160.32911682128906, + "distillation_loss": 6.245388984680176, + "epoch": 1.97, + "learning_rate": 3.595580243931893e-05, + "loss": 166.9564, + "step": 2326, + "task_loss": 3.6393094062805176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499937714390008, + "compression/movement_sparsity/importance_threshold": -1.895109197199915e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9410301140560944, + "compression/movement_sparsity/model_sparsity": 0.9087028346910256, + "compression_loss": 160.32839965820312, + "distillation_loss": 6.248898506164551, + "epoch": 1.97, + "learning_rate": 3.5949764521193095e-05, + "loss": 165.8482, + "step": 2327, + "task_loss": 3.804799795150757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999421616192525, + "compression/movement_sparsity/importance_threshold": -1.7597972841541104e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9410607949394214, + "compression/movement_sparsity/model_sparsity": 0.908732461591625, + "compression_loss": 160.3275604248047, + "distillation_loss": 5.294013977050781, + "epoch": 1.97, + "learning_rate": 3.594372660306726e-05, + "loss": 166.092, + "step": 2328, + "task_loss": 2.438602924346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999463919545695, + "compression/movement_sparsity/importance_threshold": -1.6310846109174326e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9410361715332534, + "compression/movement_sparsity/model_sparsity": 0.9087086840752093, + "compression_loss": 160.3267364501953, + "distillation_loss": 6.122711658477783, + "epoch": 1.97, + "learning_rate": 3.5937688684941436e-05, + "loss": 166.5029, + "step": 2329, + "task_loss": 3.0187790393829346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999504108183066, + "compression/movement_sparsity/importance_threshold": -1.508806196500942e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9410731841495951, + "compression/movement_sparsity/model_sparsity": 0.9087444251943155, + "compression_loss": 160.3258819580078, + "distillation_loss": 7.186318397521973, + "epoch": 1.97, + "learning_rate": 3.5931650766815604e-05, + "loss": 167.4734, + "step": 2330, + "task_loss": 3.3950819969177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999542236328125, + "compression/movement_sparsity/importance_threshold": -1.392797059907025e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9411659184012992, + "compression/movement_sparsity/model_sparsity": 0.9088339737391897, + "compression_loss": 160.32492065429688, + "distillation_loss": 7.055994033813477, + "epoch": 1.97, + "learning_rate": 3.592561284868977e-05, + "loss": 166.727, + "step": 2331, + "task_loss": 3.6077022552490234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999578358204353, + "compression/movement_sparsity/importance_threshold": -1.282892220146742e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9411830534301919, + "compression/movement_sparsity/model_sparsity": 0.9088505201271264, + "compression_loss": 160.3240509033203, + "distillation_loss": 6.517487525939941, + "epoch": 1.97, + "learning_rate": 3.5919574930563945e-05, + "loss": 166.131, + "step": 2332, + "task_loss": 3.4898476600646973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999612528035227, + "compression/movement_sparsity/importance_threshold": -1.1789266962224793e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9411969927821583, + "compression/movement_sparsity/model_sparsity": 0.9088639806194703, + "compression_loss": 160.32321166992188, + "distillation_loss": 5.848492622375488, + "epoch": 1.97, + "learning_rate": 3.591353701243811e-05, + "loss": 166.6086, + "step": 2333, + "task_loss": 3.5669665336608887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999644800044236, + "compression/movement_sparsity/importance_threshold": -1.0807355071279501e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.941202764079294, + "compression/movement_sparsity/model_sparsity": 0.9088695536547948, + "compression_loss": 160.3223114013672, + "distillation_loss": 8.408719062805176, + "epoch": 1.97, + "learning_rate": 3.590749909431228e-05, + "loss": 167.1881, + "step": 2334, + "task_loss": 4.510208606719971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999675228454856, + "compression/movement_sparsity/importance_threshold": -9.881536718828882e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9412543241801514, + "compression/movement_sparsity/model_sparsity": 0.9089193425075704, + "compression_loss": 160.3214111328125, + "distillation_loss": 4.212553024291992, + "epoch": 1.97, + "learning_rate": 3.590146117618645e-05, + "loss": 166.0254, + "step": 2335, + "task_loss": 1.791365623474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999703867490572, + "compression/movement_sparsity/importance_threshold": -9.010162094896801e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9412577702645982, + "compression/movement_sparsity/model_sparsity": 0.9089226702084151, + "compression_loss": 160.3204345703125, + "distillation_loss": 7.156421661376953, + "epoch": 1.97, + "learning_rate": 3.589542325806062e-05, + "loss": 166.9149, + "step": 2336, + "task_loss": 3.752372980117798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999730771374868, + "compression/movement_sparsity/importance_threshold": -8.191581389420388e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.941311154763104, + "compression/movement_sparsity/model_sparsity": 0.9089742207851672, + "compression_loss": 160.3194580078125, + "distillation_loss": 7.032711029052734, + "epoch": 1.97, + "learning_rate": 3.5889385339934794e-05, + "loss": 166.0417, + "step": 2337, + "task_loss": 2.7310333251953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499975599433122, + "compression/movement_sparsity/importance_threshold": -7.424144792510246e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9413510768763489, + "compression/movement_sparsity/model_sparsity": 0.9090127714510073, + "compression_loss": 160.3184356689453, + "distillation_loss": 5.627411842346191, + "epoch": 1.98, + "learning_rate": 3.588334742180896e-05, + "loss": 166.1268, + "step": 2338, + "task_loss": 4.962368011474609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999779590583118, + "compression/movement_sparsity/importance_threshold": -6.706202494276975e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414048667965542, + "compression/movement_sparsity/model_sparsity": 0.9090647135219765, + "compression_loss": 160.3174285888672, + "distillation_loss": 6.0490217208862305, + "epoch": 1.98, + "learning_rate": 3.5877309503683135e-05, + "loss": 165.0236, + "step": 2339, + "task_loss": 2.152772903442383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999801614354036, + "compression/movement_sparsity/importance_threshold": -6.036104684657706e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9413920721646809, + "compression/movement_sparsity/model_sparsity": 0.9090523584250689, + "compression_loss": 160.31634521484375, + "distillation_loss": 7.236522674560547, + "epoch": 1.98, + "learning_rate": 3.58712715855573e-05, + "loss": 166.6133, + "step": 2340, + "task_loss": 3.228642225265503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499982211986746, + "compression/movement_sparsity/importance_threshold": -5.4122015536763046e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414497612877033, + "compression/movement_sparsity/model_sparsity": 0.909108065749243, + "compression_loss": 160.3153533935547, + "distillation_loss": 7.498824119567871, + "epoch": 1.98, + "learning_rate": 3.586523366743147e-05, + "loss": 166.5062, + "step": 2341, + "task_loss": 3.1690895557403564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999841161346872, + "compression/movement_sparsity/importance_threshold": -4.832843291616845e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414312549795324, + "compression/movement_sparsity/model_sparsity": 0.9090901951896898, + "compression_loss": 160.3143768310547, + "distillation_loss": 7.292119979858398, + "epoch": 1.98, + "learning_rate": 3.5859195749305644e-05, + "loss": 165.8706, + "step": 2342, + "task_loss": 3.551690101623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999858793015752, + "compression/movement_sparsity/importance_threshold": -4.296380088242985e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414406273752942, + "compression/movement_sparsity/model_sparsity": 0.9090992456148244, + "compression_loss": 160.31329345703125, + "distillation_loss": 6.0655317306518555, + "epoch": 1.98, + "learning_rate": 3.585315783117981e-05, + "loss": 167.0495, + "step": 2343, + "task_loss": 2.7222015857696533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999875069097586, + "compression/movement_sparsity/importance_threshold": -3.801162133752062e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414345222014646, + "compression/movement_sparsity/model_sparsity": 0.9090933501724975, + "compression_loss": 160.31227111816406, + "distillation_loss": 5.832024574279785, + "epoch": 1.98, + "learning_rate": 3.584711991305398e-05, + "loss": 165.6878, + "step": 2344, + "task_loss": 2.6053621768951416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999890043815851, + "compression/movement_sparsity/importance_threshold": -3.345539618167942e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414276896534093, + "compression/movement_sparsity/model_sparsity": 0.9090867523434872, + "compression_loss": 160.3112030029297, + "distillation_loss": 5.7712554931640625, + "epoch": 1.98, + "learning_rate": 3.584108199492815e-05, + "loss": 166.9068, + "step": 2345, + "task_loss": 3.3420400619506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999903771394032, + "compression/movement_sparsity/importance_threshold": -2.9278627315144912e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414371574425121, + "compression/movement_sparsity/model_sparsity": 0.9090958948849082, + "compression_loss": 160.3101806640625, + "distillation_loss": 5.332365989685059, + "epoch": 1.98, + "learning_rate": 3.583504407680232e-05, + "loss": 165.5881, + "step": 2346, + "task_loss": 2.162992238998413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999916306055607, + "compression/movement_sparsity/importance_threshold": -2.5464816638155752e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414648572839303, + "compression/movement_sparsity/model_sparsity": 0.9091226431515589, + "compression_loss": 160.30894470214844, + "distillation_loss": 6.155909061431885, + "epoch": 1.98, + "learning_rate": 3.582900615867649e-05, + "loss": 165.6931, + "step": 2347, + "task_loss": 3.164666175842285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999927702024065, + "compression/movement_sparsity/importance_threshold": -2.199746605181796e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9414698654343373, + "compression/movement_sparsity/model_sparsity": 0.9091274792565925, + "compression_loss": 160.3078155517578, + "distillation_loss": 5.989180564880371, + "epoch": 1.98, + "learning_rate": 3.582296824055066e-05, + "loss": 166.4385, + "step": 2348, + "task_loss": 3.946364402770996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999938013522884, + "compression/movement_sparsity/importance_threshold": -1.8860077456370195e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415318591818762, + "compression/movement_sparsity/model_sparsity": 0.9091873433281884, + "compression_loss": 160.306640625, + "distillation_loss": 8.105067253112793, + "epoch": 1.99, + "learning_rate": 3.581693032242483e-05, + "loss": 166.4561, + "step": 2349, + "task_loss": 3.174832344055176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999947294775544, + "compression/movement_sparsity/importance_threshold": -1.6036152752051114e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9415299513150545, + "compression/movement_sparsity/model_sparsity": 0.9091855010024612, + "compression_loss": 160.30557250976562, + "distillation_loss": 6.455706596374512, + "epoch": 1.99, + "learning_rate": 3.5810892404299e-05, + "loss": 166.6083, + "step": 2350, + "task_loss": 4.323982238769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999955600005528, + "compression/movement_sparsity/importance_threshold": -1.3509193839099376e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9416095093615209, + "compression/movement_sparsity/model_sparsity": 0.9092623259852824, + "compression_loss": 160.30433654785156, + "distillation_loss": 7.006874084472656, + "epoch": 1.99, + "learning_rate": 3.580485448617317e-05, + "loss": 167.0213, + "step": 2351, + "task_loss": 3.4319732189178467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999962983436324, + "compression/movement_sparsity/importance_threshold": -1.1262702618621001e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9416884711996056, + "compression/movement_sparsity/model_sparsity": 0.9093385752413139, + "compression_loss": 160.3032684326172, + "distillation_loss": 6.511745452880859, + "epoch": 1.99, + "learning_rate": 3.5798816568047336e-05, + "loss": 166.3171, + "step": 2352, + "task_loss": 3.790663242340088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999969499291403, + "compression/movement_sparsity/importance_threshold": -9.280180990854647e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9417176734861459, + "compression/movement_sparsity/model_sparsity": 0.9093667743394747, + "compression_loss": 160.30201721191406, + "distillation_loss": 7.695958137512207, + "epoch": 1.99, + "learning_rate": 3.579277864992151e-05, + "loss": 166.9726, + "step": 2353, + "task_loss": 3.62270450592041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999975201794253, + "compression/movement_sparsity/importance_threshold": -7.545130856038973e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9417636650007174, + "compression/movement_sparsity/model_sparsity": 0.9094111859040342, + "compression_loss": 160.3009033203125, + "distillation_loss": 7.470867156982422, + "epoch": 1.99, + "learning_rate": 3.578674073179568e-05, + "loss": 166.7849, + "step": 2354, + "task_loss": 2.661835193634033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499998014516836, + "compression/movement_sparsity/importance_threshold": -6.041054114412636e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9418008803279089, + "compression/movement_sparsity/model_sparsity": 0.909447122770249, + "compression_loss": 160.29977416992188, + "distillation_loss": 6.278006553649902, + "epoch": 1.99, + "learning_rate": 3.5780702813669844e-05, + "loss": 166.408, + "step": 2355, + "task_loss": 2.809206485748291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.49999843836372, + "compression/movement_sparsity/importance_threshold": -4.751452667081657e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.941894008077145, + "compression/movement_sparsity/model_sparsity": 0.9095370512948043, + "compression_loss": 160.2986602783203, + "distillation_loss": 6.513699531555176, + "epoch": 1.99, + "learning_rate": 3.577466489554402e-05, + "loss": 166.3174, + "step": 2356, + "task_loss": 3.2685694694519043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999987971424256, + "compression/movement_sparsity/importance_threshold": -3.6598284142846937e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9419025577053399, + "compression/movement_sparsity/model_sparsity": 0.909545307216969, + "compression_loss": 160.2974090576172, + "distillation_loss": 8.049238204956055, + "epoch": 1.99, + "learning_rate": 3.576862697741819e-05, + "loss": 166.8908, + "step": 2357, + "task_loss": 3.2528319358825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999990962753007, + "compression/movement_sparsity/importance_threshold": -2.7496832562604046e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9419213144210311, + "compression/movement_sparsity/model_sparsity": 0.9095634195817739, + "compression_loss": 160.2963409423828, + "distillation_loss": 7.633810520172119, + "epoch": 1.99, + "learning_rate": 3.576258905929236e-05, + "loss": 166.9125, + "step": 2358, + "task_loss": 3.777378559112549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999993411846944, + "compression/movement_sparsity/importance_threshold": -2.0045190941148094e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.941962643586057, + "compression/movement_sparsity/model_sparsity": 0.9096033289628378, + "compression_loss": 160.2950897216797, + "distillation_loss": 7.555692195892334, + "epoch": 1.99, + "learning_rate": 3.5756551141166526e-05, + "loss": 166.6265, + "step": 2359, + "task_loss": 3.1981210708618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999537292954, + "compression/movement_sparsity/importance_threshold": -1.407837827219205e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.941944173050389, + "compression/movement_sparsity/model_sparsity": 0.909585492946892, + "compression_loss": 160.2939910888672, + "distillation_loss": 8.960050582885742, + "epoch": 1.99, + "learning_rate": 3.57505132230407e-05, + "loss": 166.9505, + "step": 2360, + "task_loss": 4.03400182723999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999996900224282, + "compression/movement_sparsity/importance_threshold": -9.43141356679611e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9419781211556484, + "compression/movement_sparsity/model_sparsity": 0.9096182748302991, + "compression_loss": 160.29286193847656, + "distillation_loss": 5.304047584533691, + "epoch": 2.0, + "learning_rate": 3.574447530491487e-05, + "loss": 166.2889, + "step": 2361, + "task_loss": 2.0510902404785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999804795465, + "compression/movement_sparsity/importance_threshold": -5.939315836020476e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9420176616955289, + "compression/movement_sparsity/model_sparsity": 0.9096564570309937, + "compression_loss": 160.29153442382812, + "distillation_loss": 8.29934024810791, + "epoch": 2.0, + "learning_rate": 3.5738437386789035e-05, + "loss": 167.4791, + "step": 2362, + "task_loss": 3.6830313205718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999998870344127, + "compression/movement_sparsity/importance_threshold": -3.437104073578112e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9420447414802299, + "compression/movement_sparsity/model_sparsity": 0.9096826065417831, + "compression_loss": 160.29046630859375, + "distillation_loss": 5.714633464813232, + "epoch": 2.0, + "learning_rate": 3.573239946866321e-05, + "loss": 166.59, + "step": 2363, + "task_loss": 2.2213969230651855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999999421616192, + "compression/movement_sparsity/importance_threshold": -1.7597972818556018e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.942013989051897, + "compression/movement_sparsity/model_sparsity": 0.909652910553969, + "compression_loss": 160.289306640625, + "distillation_loss": 6.498959064483643, + "epoch": 2.0, + "learning_rate": 3.5726361550537376e-05, + "loss": 166.1531, + "step": 2364, + "task_loss": 3.3319220542907715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.499999975599433, + "compression/movement_sparsity/importance_threshold": -7.424144805867616e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9420308379007665, + "compression/movement_sparsity/model_sparsity": 0.9096691805930467, + "compression_loss": 160.28810119628906, + "distillation_loss": 7.490935802459717, + "epoch": 2.0, + "learning_rate": 3.572032363241154e-05, + "loss": 167.1614, + "step": 2365, + "task_loss": 3.5815749168395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.4999999927702024, + "compression/movement_sparsity/importance_threshold": -2.1997466348455674e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9420491772705905, + "compression/movement_sparsity/model_sparsity": 0.9096868899490987, + "compression_loss": 160.28704833984375, + "distillation_loss": 6.73069953918457, + "epoch": 2.0, + "learning_rate": 3.571428571428572e-05, + "loss": 166.1742, + "step": 2366, + "task_loss": 2.4928009510040283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 5.794016361236572, + "epoch": 2.0, + "learning_rate": 3.570824779615989e-05, + "loss": 131.6317, + "step": 2367, + "task_loss": 2.8381009101867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 6.008797645568848, + "epoch": 2.0, + "learning_rate": 3.570220987803405e-05, + "loss": 5.5539, + "step": 2368, + "task_loss": 2.2601335048675537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 4.821380615234375, + "epoch": 2.0, + "learning_rate": 3.5696171959908225e-05, + "loss": 5.18, + "step": 2369, + "task_loss": 2.0644631385803223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 4.630273818969727, + "epoch": 2.0, + "learning_rate": 3.56901340417824e-05, + "loss": 4.5273, + "step": 2370, + "task_loss": 2.232436418533325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 5.392454147338867, + "epoch": 2.0, + "learning_rate": 3.568409612365656e-05, + "loss": 4.0314, + "step": 2371, + "task_loss": 2.859858512878418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.579754114151001, + "epoch": 2.01, + "learning_rate": 3.5678058205530734e-05, + "loss": 3.8648, + "step": 2372, + "task_loss": 2.16394305229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 4.485686302185059, + "epoch": 2.01, + "learning_rate": 3.567202028740491e-05, + "loss": 3.5806, + "step": 2373, + "task_loss": 1.6243481636047363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.8978517055511475, + "epoch": 2.01, + "learning_rate": 3.5665982369279075e-05, + "loss": 3.6027, + "step": 2374, + "task_loss": 1.5872914791107178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.2786474227905273, + "epoch": 2.01, + "learning_rate": 3.565994445115324e-05, + "loss": 3.3165, + "step": 2375, + "task_loss": 0.8725880980491638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.767122507095337, + "epoch": 2.01, + "learning_rate": 3.5653906533027416e-05, + "loss": 2.4885, + "step": 2376, + "task_loss": 1.1516541242599487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.646667718887329, + "epoch": 2.01, + "learning_rate": 3.564786861490158e-05, + "loss": 3.203, + "step": 2377, + "task_loss": 2.30611515045166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.2585976123809814, + "epoch": 2.01, + "learning_rate": 3.564183069677575e-05, + "loss": 2.6552, + "step": 2378, + "task_loss": 1.1936619281768799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.1175904273986816, + "epoch": 2.01, + "learning_rate": 3.5635792778649924e-05, + "loss": 2.8798, + "step": 2379, + "task_loss": 1.9321706295013428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.603166103363037, + "epoch": 2.01, + "learning_rate": 3.56297548605241e-05, + "loss": 2.7689, + "step": 2380, + "task_loss": 1.2533173561096191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.8723547458648682, + "epoch": 2.01, + "learning_rate": 3.562371694239826e-05, + "loss": 2.7127, + "step": 2381, + "task_loss": 1.7071744203567505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7016043663024902, + "epoch": 2.01, + "learning_rate": 3.561767902427243e-05, + "loss": 2.1324, + "step": 2382, + "task_loss": 1.450547456741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.9803593158721924, + "epoch": 2.01, + "learning_rate": 3.5611641106146607e-05, + "loss": 2.2638, + "step": 2383, + "task_loss": 1.4079058170318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.2916595935821533, + "epoch": 2.02, + "learning_rate": 3.560560318802077e-05, + "loss": 2.1265, + "step": 2384, + "task_loss": 1.376882553100586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.843372344970703, + "epoch": 2.02, + "learning_rate": 3.559956526989494e-05, + "loss": 2.6347, + "step": 2385, + "task_loss": 1.6197419166564941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.6076645851135254, + "epoch": 2.02, + "learning_rate": 3.5593527351769115e-05, + "loss": 2.4356, + "step": 2386, + "task_loss": 1.2698694467544556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5698668956756592, + "epoch": 2.02, + "learning_rate": 3.558748943364328e-05, + "loss": 2.0795, + "step": 2387, + "task_loss": 1.0015205144882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.4925687313079834, + "epoch": 2.02, + "learning_rate": 3.558145151551745e-05, + "loss": 2.3888, + "step": 2388, + "task_loss": 1.4927409887313843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.024552345275879, + "epoch": 2.02, + "learning_rate": 3.557541359739162e-05, + "loss": 2.7326, + "step": 2389, + "task_loss": 1.8711820840835571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.1021745204925537, + "epoch": 2.02, + "learning_rate": 3.556937567926579e-05, + "loss": 2.014, + "step": 2390, + "task_loss": 1.9234135150909424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.8932304382324219, + "epoch": 2.02, + "learning_rate": 3.556333776113996e-05, + "loss": 1.925, + "step": 2391, + "task_loss": 0.9768822193145752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.9730640649795532, + "epoch": 2.02, + "learning_rate": 3.555729984301413e-05, + "loss": 2.3824, + "step": 2392, + "task_loss": 1.5152168273925781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7444813251495361, + "epoch": 2.02, + "learning_rate": 3.55512619248883e-05, + "loss": 1.8275, + "step": 2393, + "task_loss": 1.8751744031906128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.674318313598633, + "epoch": 2.02, + "learning_rate": 3.5545224006762466e-05, + "loss": 2.566, + "step": 2394, + "task_loss": 1.9626497030258179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.885780096054077, + "epoch": 2.02, + "learning_rate": 3.553918608863664e-05, + "loss": 2.4257, + "step": 2395, + "task_loss": 2.4011831283569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.688016653060913, + "epoch": 2.03, + "learning_rate": 3.5533148170510814e-05, + "loss": 1.7059, + "step": 2396, + "task_loss": 1.5597035884857178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6211036443710327, + "epoch": 2.03, + "learning_rate": 3.552711025238498e-05, + "loss": 1.5099, + "step": 2397, + "task_loss": 1.206658124923706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7738001346588135, + "epoch": 2.03, + "learning_rate": 3.552107233425915e-05, + "loss": 1.6583, + "step": 2398, + "task_loss": 1.1543632745742798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.8059853315353394, + "epoch": 2.03, + "learning_rate": 3.551503441613332e-05, + "loss": 1.8422, + "step": 2399, + "task_loss": 1.884414792060852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.048482894897461, + "epoch": 2.03, + "learning_rate": 3.550899649800749e-05, + "loss": 2.1733, + "step": 2400, + "task_loss": 1.3014881610870361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.006402015686035, + "epoch": 2.03, + "learning_rate": 3.5502958579881656e-05, + "loss": 2.1443, + "step": 2401, + "task_loss": 1.1606464385986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.1068930625915527, + "epoch": 2.03, + "learning_rate": 3.549692066175583e-05, + "loss": 1.8529, + "step": 2402, + "task_loss": 2.0201778411865234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.111406922340393, + "epoch": 2.03, + "learning_rate": 3.549088274363e-05, + "loss": 1.5961, + "step": 2403, + "task_loss": 0.9130090475082397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4888978004455566, + "epoch": 2.03, + "learning_rate": 3.5484844825504165e-05, + "loss": 1.8803, + "step": 2404, + "task_loss": 1.5416336059570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7527201175689697, + "epoch": 2.03, + "learning_rate": 3.547880690737834e-05, + "loss": 2.0179, + "step": 2405, + "task_loss": 0.8192425966262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1548995971679688, + "epoch": 2.03, + "learning_rate": 3.5472768989252506e-05, + "loss": 1.161, + "step": 2406, + "task_loss": 0.5419263243675232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.169149398803711, + "epoch": 2.03, + "learning_rate": 3.546673107112668e-05, + "loss": 1.6494, + "step": 2407, + "task_loss": 0.2783471643924713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1632423400878906, + "epoch": 2.04, + "learning_rate": 3.546069315300085e-05, + "loss": 1.1968, + "step": 2408, + "task_loss": 1.829977035522461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4465243816375732, + "epoch": 2.04, + "learning_rate": 3.5454655234875014e-05, + "loss": 1.5152, + "step": 2409, + "task_loss": 1.3301951885223389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1828058958053589, + "epoch": 2.04, + "learning_rate": 3.544861731674919e-05, + "loss": 1.5683, + "step": 2410, + "task_loss": 0.5418788194656372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6635905504226685, + "epoch": 2.04, + "learning_rate": 3.5442579398623355e-05, + "loss": 1.5841, + "step": 2411, + "task_loss": 1.256627082824707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2581639289855957, + "epoch": 2.04, + "learning_rate": 3.543654148049753e-05, + "loss": 1.7816, + "step": 2412, + "task_loss": 1.2460963726043701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1339378356933594, + "epoch": 2.04, + "learning_rate": 3.5430503562371697e-05, + "loss": 1.5445, + "step": 2413, + "task_loss": 0.8661832213401794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0598548650741577, + "epoch": 2.04, + "learning_rate": 3.5424465644245864e-05, + "loss": 0.9845, + "step": 2414, + "task_loss": 0.7637588381767273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.230006694793701, + "epoch": 2.04, + "learning_rate": 3.541842772612004e-05, + "loss": 1.7225, + "step": 2415, + "task_loss": 1.293196439743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1976304054260254, + "epoch": 2.04, + "learning_rate": 3.5412389807994205e-05, + "loss": 1.4816, + "step": 2416, + "task_loss": 0.8857174515724182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.113510012626648, + "epoch": 2.04, + "learning_rate": 3.540635188986838e-05, + "loss": 1.6252, + "step": 2417, + "task_loss": 0.7524394989013672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.653610348701477, + "epoch": 2.04, + "learning_rate": 3.5400313971742546e-05, + "loss": 1.4815, + "step": 2418, + "task_loss": 0.8934035301208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.234905481338501, + "epoch": 2.04, + "learning_rate": 3.539427605361671e-05, + "loss": 1.9128, + "step": 2419, + "task_loss": 0.856835126876831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.098511815071106, + "epoch": 2.05, + "learning_rate": 3.538823813549089e-05, + "loss": 1.418, + "step": 2420, + "task_loss": 0.6759884357452393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7596487998962402, + "epoch": 2.05, + "learning_rate": 3.5382200217365054e-05, + "loss": 1.7011, + "step": 2421, + "task_loss": 1.2066035270690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.4780449867248535, + "epoch": 2.05, + "learning_rate": 3.537616229923922e-05, + "loss": 1.7158, + "step": 2422, + "task_loss": 2.2709736824035645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5654544830322266, + "epoch": 2.05, + "learning_rate": 3.5370124381113395e-05, + "loss": 1.6715, + "step": 2423, + "task_loss": 1.018605351448059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.8508186340332031, + "epoch": 2.05, + "learning_rate": 3.536408646298756e-05, + "loss": 1.9541, + "step": 2424, + "task_loss": 0.8859961032867432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.0176711082458496, + "epoch": 2.05, + "learning_rate": 3.535804854486173e-05, + "loss": 1.8051, + "step": 2425, + "task_loss": 1.6450481414794922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.266354560852051, + "epoch": 2.05, + "learning_rate": 3.5352010626735904e-05, + "loss": 1.8908, + "step": 2426, + "task_loss": 2.2552289962768555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2633095979690552, + "epoch": 2.05, + "learning_rate": 3.534597270861008e-05, + "loss": 1.1239, + "step": 2427, + "task_loss": 1.139604091644287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8411358594894409, + "epoch": 2.05, + "learning_rate": 3.533993479048424e-05, + "loss": 1.6262, + "step": 2428, + "task_loss": 0.5337929725646973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.513352155685425, + "epoch": 2.05, + "learning_rate": 3.533389687235841e-05, + "loss": 1.7072, + "step": 2429, + "task_loss": 1.131248116493225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.76312255859375, + "epoch": 2.05, + "learning_rate": 3.5327858954232586e-05, + "loss": 1.7512, + "step": 2430, + "task_loss": 1.7146672010421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.882275402545929, + "epoch": 2.05, + "learning_rate": 3.532182103610675e-05, + "loss": 1.369, + "step": 2431, + "task_loss": 0.7552844285964966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3863966464996338, + "epoch": 2.06, + "learning_rate": 3.531578311798092e-05, + "loss": 1.6122, + "step": 2432, + "task_loss": 1.0116581916809082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.161930561065674, + "epoch": 2.06, + "learning_rate": 3.5309745199855094e-05, + "loss": 1.5809, + "step": 2433, + "task_loss": 1.4254279136657715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9569919109344482, + "epoch": 2.06, + "learning_rate": 3.530370728172926e-05, + "loss": 1.3159, + "step": 2434, + "task_loss": 1.2949148416519165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.1029558181762695, + "epoch": 2.06, + "learning_rate": 3.529766936360343e-05, + "loss": 1.5769, + "step": 2435, + "task_loss": 1.9820683002471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.9795022010803223, + "epoch": 2.06, + "learning_rate": 3.52916314454776e-05, + "loss": 1.7308, + "step": 2436, + "task_loss": 1.5297834873199463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.176500678062439, + "epoch": 2.06, + "learning_rate": 3.528559352735178e-05, + "loss": 1.5442, + "step": 2437, + "task_loss": 0.5777260065078735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1056512594223022, + "epoch": 2.06, + "learning_rate": 3.527955560922594e-05, + "loss": 1.6704, + "step": 2438, + "task_loss": 1.3359051942825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.409621000289917, + "epoch": 2.06, + "learning_rate": 3.527351769110011e-05, + "loss": 1.6319, + "step": 2439, + "task_loss": 0.8464250564575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.383544921875, + "epoch": 2.06, + "learning_rate": 3.5267479772974285e-05, + "loss": 1.59, + "step": 2440, + "task_loss": 1.6564099788665771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6310456991195679, + "epoch": 2.06, + "learning_rate": 3.5261441854848445e-05, + "loss": 1.3342, + "step": 2441, + "task_loss": 1.016051173210144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.312596321105957, + "epoch": 2.06, + "learning_rate": 3.525540393672262e-05, + "loss": 1.0807, + "step": 2442, + "task_loss": 0.5171737670898438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.663266181945801, + "epoch": 2.07, + "learning_rate": 3.524936601859679e-05, + "loss": 1.527, + "step": 2443, + "task_loss": 2.172339677810669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7688603401184082, + "epoch": 2.07, + "learning_rate": 3.5243328100470954e-05, + "loss": 1.4229, + "step": 2444, + "task_loss": 1.9007397890090942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3331525325775146, + "epoch": 2.07, + "learning_rate": 3.523729018234513e-05, + "loss": 1.5247, + "step": 2445, + "task_loss": 1.0954235792160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3711392879486084, + "epoch": 2.07, + "learning_rate": 3.52312522642193e-05, + "loss": 1.4729, + "step": 2446, + "task_loss": 1.2132072448730469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2634410858154297, + "epoch": 2.07, + "learning_rate": 3.522521434609347e-05, + "loss": 1.2304, + "step": 2447, + "task_loss": 0.6916882395744324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.019486904144287, + "epoch": 2.07, + "learning_rate": 3.5219176427967636e-05, + "loss": 1.7764, + "step": 2448, + "task_loss": 1.7461954355239868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.407402515411377, + "epoch": 2.07, + "learning_rate": 3.521313850984181e-05, + "loss": 1.0428, + "step": 2449, + "task_loss": 1.012795329093933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9032294750213623, + "epoch": 2.07, + "learning_rate": 3.520710059171598e-05, + "loss": 1.2082, + "step": 2450, + "task_loss": 0.12767401337623596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3116698265075684, + "epoch": 2.07, + "learning_rate": 3.5201062673590144e-05, + "loss": 1.259, + "step": 2451, + "task_loss": 0.6800302267074585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.600150227546692, + "epoch": 2.07, + "learning_rate": 3.519502475546432e-05, + "loss": 1.3536, + "step": 2452, + "task_loss": 1.9611575603485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6851948499679565, + "epoch": 2.07, + "learning_rate": 3.518898683733849e-05, + "loss": 1.6067, + "step": 2453, + "task_loss": 1.4527864456176758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6482417583465576, + "epoch": 2.07, + "learning_rate": 3.518294891921265e-05, + "loss": 1.4196, + "step": 2454, + "task_loss": 0.9126148819923401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1837124824523926, + "epoch": 2.08, + "learning_rate": 3.517691100108683e-05, + "loss": 1.482, + "step": 2455, + "task_loss": 1.0458905696868896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.624044418334961, + "epoch": 2.08, + "learning_rate": 3.5170873082961e-05, + "loss": 1.3499, + "step": 2456, + "task_loss": 1.2754698991775513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7467665672302246, + "epoch": 2.08, + "learning_rate": 3.516483516483517e-05, + "loss": 1.2571, + "step": 2457, + "task_loss": 1.8186559677124023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.71449875831604, + "epoch": 2.08, + "learning_rate": 3.5158797246709335e-05, + "loss": 1.2227, + "step": 2458, + "task_loss": 1.1837830543518066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2735698223114014, + "epoch": 2.08, + "learning_rate": 3.515275932858351e-05, + "loss": 1.2175, + "step": 2459, + "task_loss": 1.2002431154251099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.405210256576538, + "epoch": 2.08, + "learning_rate": 3.5146721410457676e-05, + "loss": 1.5672, + "step": 2460, + "task_loss": 1.0881227254867554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3665781021118164, + "epoch": 2.08, + "learning_rate": 3.514068349233184e-05, + "loss": 1.3352, + "step": 2461, + "task_loss": 1.4953579902648926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.41208016872406, + "epoch": 2.08, + "learning_rate": 3.513464557420602e-05, + "loss": 1.3782, + "step": 2462, + "task_loss": 0.352135568857193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1963071823120117, + "epoch": 2.08, + "learning_rate": 3.5128607656080184e-05, + "loss": 1.8499, + "step": 2463, + "task_loss": 0.4583376348018646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.74224853515625, + "epoch": 2.08, + "learning_rate": 3.512256973795435e-05, + "loss": 1.3288, + "step": 2464, + "task_loss": 1.064825415611267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2254512310028076, + "epoch": 2.08, + "learning_rate": 3.5116531819828526e-05, + "loss": 1.4212, + "step": 2465, + "task_loss": 1.3030978441238403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5794579982757568, + "epoch": 2.08, + "learning_rate": 3.511049390170269e-05, + "loss": 1.3463, + "step": 2466, + "task_loss": 1.7503514289855957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1074600219726562, + "epoch": 2.09, + "learning_rate": 3.510445598357686e-05, + "loss": 1.2721, + "step": 2467, + "task_loss": 0.8048511743545532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6260517835617065, + "epoch": 2.09, + "learning_rate": 3.5098418065451034e-05, + "loss": 1.2349, + "step": 2468, + "task_loss": 1.097236156463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2198338508605957, + "epoch": 2.09, + "learning_rate": 3.509238014732521e-05, + "loss": 1.2554, + "step": 2469, + "task_loss": 1.596426248550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3188869953155518, + "epoch": 2.09, + "learning_rate": 3.5086342229199375e-05, + "loss": 1.1693, + "step": 2470, + "task_loss": 1.3935199975967407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9356237649917603, + "epoch": 2.09, + "learning_rate": 3.508030431107354e-05, + "loss": 1.138, + "step": 2471, + "task_loss": 0.9571179151535034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9884477257728577, + "epoch": 2.09, + "learning_rate": 3.5074266392947716e-05, + "loss": 1.3923, + "step": 2472, + "task_loss": 1.318662166595459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4606609344482422, + "epoch": 2.09, + "learning_rate": 3.506822847482188e-05, + "loss": 1.4305, + "step": 2473, + "task_loss": 0.7601984143257141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.185983419418335, + "epoch": 2.09, + "learning_rate": 3.506219055669605e-05, + "loss": 1.6359, + "step": 2474, + "task_loss": 1.1885576248168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4593037366867065, + "epoch": 2.09, + "learning_rate": 3.5056152638570225e-05, + "loss": 1.5689, + "step": 2475, + "task_loss": 0.5814553499221802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3374162912368774, + "epoch": 2.09, + "learning_rate": 3.505011472044439e-05, + "loss": 1.1298, + "step": 2476, + "task_loss": 0.6473168730735779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.2450132369995117, + "epoch": 2.09, + "learning_rate": 3.504407680231856e-05, + "loss": 1.2661, + "step": 2477, + "task_loss": 2.231135368347168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.686068058013916, + "epoch": 2.09, + "learning_rate": 3.503803888419273e-05, + "loss": 1.386, + "step": 2478, + "task_loss": 1.2415560483932495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7942306995391846, + "epoch": 2.1, + "learning_rate": 3.50320009660669e-05, + "loss": 1.2721, + "step": 2479, + "task_loss": 0.7026255130767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4284751415252686, + "epoch": 2.1, + "learning_rate": 3.5025963047941074e-05, + "loss": 1.4027, + "step": 2480, + "task_loss": 0.561096727848053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1545677185058594, + "epoch": 2.1, + "learning_rate": 3.501992512981524e-05, + "loss": 1.3513, + "step": 2481, + "task_loss": 1.1893587112426758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0366015434265137, + "epoch": 2.1, + "learning_rate": 3.501388721168941e-05, + "loss": 1.3185, + "step": 2482, + "task_loss": 0.5875089764595032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0594216585159302, + "epoch": 2.1, + "learning_rate": 3.500784929356358e-05, + "loss": 1.0303, + "step": 2483, + "task_loss": 0.5982950329780579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2622240781784058, + "epoch": 2.1, + "learning_rate": 3.500181137543775e-05, + "loss": 1.1256, + "step": 2484, + "task_loss": 1.3893510103225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8780015707015991, + "epoch": 2.1, + "learning_rate": 3.499577345731192e-05, + "loss": 1.3606, + "step": 2485, + "task_loss": 0.7664692401885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420197248458862, + "epoch": 2.1, + "learning_rate": 3.498973553918609e-05, + "loss": 1.1911, + "step": 2486, + "task_loss": 1.314092993736267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.0764689445495605, + "epoch": 2.1, + "learning_rate": 3.498369762106026e-05, + "loss": 1.3682, + "step": 2487, + "task_loss": 2.0281178951263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 3.4595751762390137, + "epoch": 2.1, + "learning_rate": 3.497765970293443e-05, + "loss": 1.9426, + "step": 2488, + "task_loss": 3.259431838989258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6280409097671509, + "epoch": 2.1, + "learning_rate": 3.49716217848086e-05, + "loss": 1.2463, + "step": 2489, + "task_loss": 2.0687222480773926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4340925216674805, + "epoch": 2.1, + "learning_rate": 3.496558386668277e-05, + "loss": 1.1063, + "step": 2490, + "task_loss": 0.8355008363723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.973612368106842, + "epoch": 2.11, + "learning_rate": 3.495954594855694e-05, + "loss": 1.278, + "step": 2491, + "task_loss": 1.3828065395355225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3168693780899048, + "epoch": 2.11, + "learning_rate": 3.495350803043111e-05, + "loss": 1.0123, + "step": 2492, + "task_loss": 1.557356834411621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2896207571029663, + "epoch": 2.11, + "learning_rate": 3.494747011230528e-05, + "loss": 1.505, + "step": 2493, + "task_loss": 1.459700107574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7636563777923584, + "epoch": 2.11, + "learning_rate": 3.494143219417945e-05, + "loss": 1.2366, + "step": 2494, + "task_loss": 1.208235263824463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2116196155548096, + "epoch": 2.11, + "learning_rate": 3.4935394276053616e-05, + "loss": 1.2082, + "step": 2495, + "task_loss": 0.7974462509155273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9458285570144653, + "epoch": 2.11, + "learning_rate": 3.492935635792779e-05, + "loss": 1.2422, + "step": 2496, + "task_loss": 0.4279972016811371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.737341284751892, + "epoch": 2.11, + "learning_rate": 3.492331843980196e-05, + "loss": 1.227, + "step": 2497, + "task_loss": 1.66383957862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2319836616516113, + "epoch": 2.11, + "learning_rate": 3.4917280521676124e-05, + "loss": 1.1503, + "step": 2498, + "task_loss": 2.216526985168457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.143429756164551, + "epoch": 2.11, + "learning_rate": 3.49112426035503e-05, + "loss": 1.2892, + "step": 2499, + "task_loss": 1.2699668407440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.309948444366455, + "epoch": 2.11, + "learning_rate": 3.490520468542447e-05, + "loss": 1.2272, + "step": 2500, + "task_loss": 0.486138254404068 + }, + { + "epoch": 2.11, + "eval_accuracy": 0.8563564356435643, + "eval_loss": 0.7333118319511414, + "eval_runtime": 229.2574, + "eval_samples_per_second": 110.138, + "eval_steps_per_second": 0.864, + "step": 2500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7053568363189697, + "epoch": 2.11, + "learning_rate": 3.489916676729863e-05, + "loss": 0.7985, + "step": 2501, + "task_loss": 1.072098970413208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6546218395233154, + "epoch": 2.11, + "learning_rate": 3.4893128849172806e-05, + "loss": 1.3306, + "step": 2502, + "task_loss": 1.0386290550231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9856660962104797, + "epoch": 2.12, + "learning_rate": 3.488709093104698e-05, + "loss": 1.1253, + "step": 2503, + "task_loss": 0.730956494808197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4524785280227661, + "epoch": 2.12, + "learning_rate": 3.488105301292115e-05, + "loss": 1.2871, + "step": 2504, + "task_loss": 0.6943610906600952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3152192831039429, + "epoch": 2.12, + "learning_rate": 3.4875015094795315e-05, + "loss": 1.225, + "step": 2505, + "task_loss": 1.2128413915634155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3625967502593994, + "epoch": 2.12, + "learning_rate": 3.486897717666949e-05, + "loss": 1.35, + "step": 2506, + "task_loss": 1.2023200988769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6762778759002686, + "epoch": 2.12, + "learning_rate": 3.4862939258543656e-05, + "loss": 1.2491, + "step": 2507, + "task_loss": 0.7340527772903442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0951451063156128, + "epoch": 2.12, + "learning_rate": 3.485690134041782e-05, + "loss": 1.1814, + "step": 2508, + "task_loss": 1.0627542734146118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3382368087768555, + "epoch": 2.12, + "learning_rate": 3.4850863422292e-05, + "loss": 1.4205, + "step": 2509, + "task_loss": 1.3101381063461304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.234015703201294, + "epoch": 2.12, + "learning_rate": 3.484482550416617e-05, + "loss": 1.0078, + "step": 2510, + "task_loss": 0.5215551853179932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4782711267471313, + "epoch": 2.12, + "learning_rate": 3.483878758604033e-05, + "loss": 1.5511, + "step": 2511, + "task_loss": 1.44635009765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8828892111778259, + "epoch": 2.12, + "learning_rate": 3.4832749667914505e-05, + "loss": 1.454, + "step": 2512, + "task_loss": 1.097953200340271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.011842966079712, + "epoch": 2.12, + "learning_rate": 3.482671174978868e-05, + "loss": 0.9572, + "step": 2513, + "task_loss": 0.5319902300834656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5200048685073853, + "epoch": 2.13, + "learning_rate": 3.482067383166284e-05, + "loss": 1.3172, + "step": 2514, + "task_loss": 0.8237355351448059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5037763118743896, + "epoch": 2.13, + "learning_rate": 3.4814635913537013e-05, + "loss": 1.5082, + "step": 2515, + "task_loss": 1.2203800678253174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3670780658721924, + "epoch": 2.13, + "learning_rate": 3.480859799541119e-05, + "loss": 1.0011, + "step": 2516, + "task_loss": 1.9772387742996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1681078672409058, + "epoch": 2.13, + "learning_rate": 3.480256007728535e-05, + "loss": 0.8712, + "step": 2517, + "task_loss": 1.332604169845581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0971417427062988, + "epoch": 2.13, + "learning_rate": 3.479652215915952e-05, + "loss": 1.1381, + "step": 2518, + "task_loss": 0.4346051812171936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5040347576141357, + "epoch": 2.13, + "learning_rate": 3.4790484241033696e-05, + "loss": 1.1683, + "step": 2519, + "task_loss": 1.064577579498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3124432563781738, + "epoch": 2.13, + "learning_rate": 3.478444632290786e-05, + "loss": 1.2684, + "step": 2520, + "task_loss": 0.8021447062492371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3003246784210205, + "epoch": 2.13, + "learning_rate": 3.477840840478203e-05, + "loss": 1.3377, + "step": 2521, + "task_loss": 1.6360363960266113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9612334966659546, + "epoch": 2.13, + "learning_rate": 3.4772370486656204e-05, + "loss": 1.1334, + "step": 2522, + "task_loss": 1.5336648225784302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.872597336769104, + "epoch": 2.13, + "learning_rate": 3.476633256853037e-05, + "loss": 1.0612, + "step": 2523, + "task_loss": 0.4167312681674957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0671679973602295, + "epoch": 2.13, + "learning_rate": 3.476029465040454e-05, + "loss": 1.1471, + "step": 2524, + "task_loss": 0.9894512891769409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.099616050720215, + "epoch": 2.13, + "learning_rate": 3.475425673227871e-05, + "loss": 1.2886, + "step": 2525, + "task_loss": 1.6126190423965454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.053142786026001, + "epoch": 2.14, + "learning_rate": 3.4748218814152886e-05, + "loss": 1.1567, + "step": 2526, + "task_loss": 0.8060246109962463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0318787097930908, + "epoch": 2.14, + "learning_rate": 3.474218089602705e-05, + "loss": 1.1091, + "step": 2527, + "task_loss": 0.7580288648605347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3646152019500732, + "epoch": 2.14, + "learning_rate": 3.473614297790122e-05, + "loss": 1.186, + "step": 2528, + "task_loss": 1.4335401058197021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4853459596633911, + "epoch": 2.14, + "learning_rate": 3.4730105059775395e-05, + "loss": 1.1266, + "step": 2529, + "task_loss": 1.4229718446731567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1696252822875977, + "epoch": 2.14, + "learning_rate": 3.472406714164956e-05, + "loss": 1.2161, + "step": 2530, + "task_loss": 1.6826777458190918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.8503031730651855, + "epoch": 2.14, + "learning_rate": 3.471802922352373e-05, + "loss": 1.4453, + "step": 2531, + "task_loss": 2.1888749599456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.9229843616485596, + "epoch": 2.14, + "learning_rate": 3.47119913053979e-05, + "loss": 1.1687, + "step": 2532, + "task_loss": 1.2211486101150513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8921827077865601, + "epoch": 2.14, + "learning_rate": 3.470595338727207e-05, + "loss": 1.0006, + "step": 2533, + "task_loss": 2.0542569160461426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1033704280853271, + "epoch": 2.14, + "learning_rate": 3.469991546914624e-05, + "loss": 1.194, + "step": 2534, + "task_loss": 1.0243548154830933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.020796298980713, + "epoch": 2.14, + "learning_rate": 3.469387755102041e-05, + "loss": 1.2704, + "step": 2535, + "task_loss": 1.225877046585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7907130718231201, + "epoch": 2.14, + "learning_rate": 3.468783963289458e-05, + "loss": 1.2552, + "step": 2536, + "task_loss": 1.569359302520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2219887971878052, + "epoch": 2.14, + "learning_rate": 3.4681801714768746e-05, + "loss": 1.1189, + "step": 2537, + "task_loss": 1.163564682006836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8630379438400269, + "epoch": 2.15, + "learning_rate": 3.467576379664292e-05, + "loss": 1.1174, + "step": 2538, + "task_loss": 1.1075502634048462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.072967529296875, + "epoch": 2.15, + "learning_rate": 3.466972587851709e-05, + "loss": 1.1927, + "step": 2539, + "task_loss": 0.6020919680595398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9529368877410889, + "epoch": 2.15, + "learning_rate": 3.466368796039126e-05, + "loss": 1.0605, + "step": 2540, + "task_loss": 0.6872215867042542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5466176271438599, + "epoch": 2.15, + "learning_rate": 3.465765004226543e-05, + "loss": 1.3216, + "step": 2541, + "task_loss": 1.284846544265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0043617486953735, + "epoch": 2.15, + "learning_rate": 3.4651612124139595e-05, + "loss": 1.0973, + "step": 2542, + "task_loss": 1.4613919258117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8847873210906982, + "epoch": 2.15, + "learning_rate": 3.464557420601377e-05, + "loss": 1.2071, + "step": 2543, + "task_loss": 1.0765708684921265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9613034129142761, + "epoch": 2.15, + "learning_rate": 3.4639536287887936e-05, + "loss": 1.1469, + "step": 2544, + "task_loss": 0.5484113693237305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.166001319885254, + "epoch": 2.15, + "learning_rate": 3.463349836976211e-05, + "loss": 1.1228, + "step": 2545, + "task_loss": 1.4188872575759888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3129665851593018, + "epoch": 2.15, + "learning_rate": 3.462746045163628e-05, + "loss": 1.1087, + "step": 2546, + "task_loss": 1.3805090188980103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1447150707244873, + "epoch": 2.15, + "learning_rate": 3.4621422533510445e-05, + "loss": 1.0386, + "step": 2547, + "task_loss": 1.035521388053894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6487390995025635, + "epoch": 2.15, + "learning_rate": 3.461538461538462e-05, + "loss": 0.8599, + "step": 2548, + "task_loss": 0.1829056739807129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3855171203613281, + "epoch": 2.15, + "learning_rate": 3.4609346697258786e-05, + "loss": 1.0213, + "step": 2549, + "task_loss": 1.4720946550369263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1885156631469727, + "epoch": 2.16, + "learning_rate": 3.460330877913296e-05, + "loss": 1.2611, + "step": 2550, + "task_loss": 1.2164133787155151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9729989171028137, + "epoch": 2.16, + "learning_rate": 3.459727086100713e-05, + "loss": 1.0546, + "step": 2551, + "task_loss": 0.7820262312889099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.442335605621338, + "epoch": 2.16, + "learning_rate": 3.4591232942881294e-05, + "loss": 1.2985, + "step": 2552, + "task_loss": 1.735497236251831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9396524429321289, + "epoch": 2.16, + "learning_rate": 3.458519502475547e-05, + "loss": 0.7632, + "step": 2553, + "task_loss": 1.0489277839660645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2471200227737427, + "epoch": 2.16, + "learning_rate": 3.4579157106629635e-05, + "loss": 0.9683, + "step": 2554, + "task_loss": 0.4694829285144806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0390191078186035, + "epoch": 2.16, + "learning_rate": 3.45731191885038e-05, + "loss": 1.1179, + "step": 2555, + "task_loss": 1.3423452377319336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1110248565673828, + "epoch": 2.16, + "learning_rate": 3.4567081270377976e-05, + "loss": 1.1611, + "step": 2556, + "task_loss": 1.9497874975204468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9859863519668579, + "epoch": 2.16, + "learning_rate": 3.4561043352252144e-05, + "loss": 1.1326, + "step": 2557, + "task_loss": 1.3359191417694092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.510878086090088, + "epoch": 2.16, + "learning_rate": 3.455500543412631e-05, + "loss": 0.97, + "step": 2558, + "task_loss": 0.9846051931381226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6238220930099487, + "epoch": 2.16, + "learning_rate": 3.4548967516000485e-05, + "loss": 1.2379, + "step": 2559, + "task_loss": 2.1146891117095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2534136772155762, + "epoch": 2.16, + "learning_rate": 3.454292959787466e-05, + "loss": 1.3739, + "step": 2560, + "task_loss": 0.6322780251502991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3101569414138794, + "epoch": 2.16, + "learning_rate": 3.4536891679748826e-05, + "loss": 0.7611, + "step": 2561, + "task_loss": 0.6386892795562744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.358349084854126, + "epoch": 2.17, + "learning_rate": 3.453085376162299e-05, + "loss": 1.4254, + "step": 2562, + "task_loss": 1.1581001281738281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4563337564468384, + "epoch": 2.17, + "learning_rate": 3.452481584349717e-05, + "loss": 1.005, + "step": 2563, + "task_loss": 0.8354026675224304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9888055324554443, + "epoch": 2.17, + "learning_rate": 3.4518777925371334e-05, + "loss": 1.0262, + "step": 2564, + "task_loss": 1.3684767484664917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8452444076538086, + "epoch": 2.17, + "learning_rate": 3.45127400072455e-05, + "loss": 1.009, + "step": 2565, + "task_loss": 0.41719377040863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.734812617301941, + "epoch": 2.17, + "learning_rate": 3.4506702089119675e-05, + "loss": 1.1615, + "step": 2566, + "task_loss": 1.3667136430740356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8321875333786011, + "epoch": 2.17, + "learning_rate": 3.450066417099384e-05, + "loss": 1.1637, + "step": 2567, + "task_loss": 0.6410722136497498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.958520770072937, + "epoch": 2.17, + "learning_rate": 3.449462625286801e-05, + "loss": 0.8706, + "step": 2568, + "task_loss": 0.36951157450675964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7271863222122192, + "epoch": 2.17, + "learning_rate": 3.4488588334742184e-05, + "loss": 1.0141, + "step": 2569, + "task_loss": 0.6958513259887695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6534376740455627, + "epoch": 2.17, + "learning_rate": 3.448255041661636e-05, + "loss": 0.8756, + "step": 2570, + "task_loss": 0.19065265357494354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.104745864868164, + "epoch": 2.17, + "learning_rate": 3.447651249849052e-05, + "loss": 1.1397, + "step": 2571, + "task_loss": 1.342674970626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2388222217559814, + "epoch": 2.17, + "learning_rate": 3.447047458036469e-05, + "loss": 0.8099, + "step": 2572, + "task_loss": 1.2349869012832642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7347530722618103, + "epoch": 2.17, + "learning_rate": 3.4464436662238866e-05, + "loss": 0.8695, + "step": 2573, + "task_loss": 0.7151706218719482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0751478672027588, + "epoch": 2.18, + "learning_rate": 3.4458398744113026e-05, + "loss": 1.1783, + "step": 2574, + "task_loss": 1.4406627416610718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1418249607086182, + "epoch": 2.18, + "learning_rate": 3.44523608259872e-05, + "loss": 1.0347, + "step": 2575, + "task_loss": 0.8527354598045349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1142523288726807, + "epoch": 2.18, + "learning_rate": 3.4446322907861374e-05, + "loss": 1.11, + "step": 2576, + "task_loss": 1.6877604722976685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.708667516708374, + "epoch": 2.18, + "learning_rate": 3.444028498973554e-05, + "loss": 1.0734, + "step": 2577, + "task_loss": 0.6796867847442627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8069379329681396, + "epoch": 2.18, + "learning_rate": 3.443424707160971e-05, + "loss": 0.6966, + "step": 2578, + "task_loss": 1.3817098140716553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1646077632904053, + "epoch": 2.18, + "learning_rate": 3.442820915348388e-05, + "loss": 1.2992, + "step": 2579, + "task_loss": 1.5151630640029907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.448328971862793, + "epoch": 2.18, + "learning_rate": 3.442217123535805e-05, + "loss": 1.2454, + "step": 2580, + "task_loss": 0.5856110453605652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8423644304275513, + "epoch": 2.18, + "learning_rate": 3.441613331723222e-05, + "loss": 1.0249, + "step": 2581, + "task_loss": 0.3457372784614563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0610859394073486, + "epoch": 2.18, + "learning_rate": 3.441009539910639e-05, + "loss": 1.0793, + "step": 2582, + "task_loss": 0.475426584482193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.89096599817276, + "epoch": 2.18, + "learning_rate": 3.4404057480980565e-05, + "loss": 0.8534, + "step": 2583, + "task_loss": 1.1937633752822876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.086619257926941, + "epoch": 2.18, + "learning_rate": 3.4398019562854725e-05, + "loss": 1.0219, + "step": 2584, + "task_loss": 1.3101277351379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0598491430282593, + "epoch": 2.19, + "learning_rate": 3.43919816447289e-05, + "loss": 0.9312, + "step": 2585, + "task_loss": 0.2732967138290405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6186603903770447, + "epoch": 2.19, + "learning_rate": 3.438594372660307e-05, + "loss": 0.7945, + "step": 2586, + "task_loss": 0.7435577511787415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8667780160903931, + "epoch": 2.19, + "learning_rate": 3.4379905808477234e-05, + "loss": 0.8379, + "step": 2587, + "task_loss": 0.9519128799438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8923795223236084, + "epoch": 2.19, + "learning_rate": 3.437386789035141e-05, + "loss": 0.8693, + "step": 2588, + "task_loss": 0.9834550619125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9654223322868347, + "epoch": 2.19, + "learning_rate": 3.436782997222558e-05, + "loss": 1.0249, + "step": 2589, + "task_loss": 2.0153212547302246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.295036792755127, + "epoch": 2.19, + "learning_rate": 3.436179205409975e-05, + "loss": 1.467, + "step": 2590, + "task_loss": 1.553170919418335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1218854188919067, + "epoch": 2.19, + "learning_rate": 3.4355754135973916e-05, + "loss": 0.8051, + "step": 2591, + "task_loss": 1.0275639295578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1406588554382324, + "epoch": 2.19, + "learning_rate": 3.434971621784809e-05, + "loss": 0.9485, + "step": 2592, + "task_loss": 1.1052346229553223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.987655758857727, + "epoch": 2.19, + "learning_rate": 3.434367829972226e-05, + "loss": 0.7353, + "step": 2593, + "task_loss": 0.5401403903961182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5932595729827881, + "epoch": 2.19, + "learning_rate": 3.4337640381596424e-05, + "loss": 1.0469, + "step": 2594, + "task_loss": 1.0760095119476318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.079272747039795, + "epoch": 2.19, + "learning_rate": 3.43316024634706e-05, + "loss": 0.952, + "step": 2595, + "task_loss": 0.9535637497901917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7089083194732666, + "epoch": 2.19, + "learning_rate": 3.4325564545344765e-05, + "loss": 1.0304, + "step": 2596, + "task_loss": 1.0816075801849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9977084398269653, + "epoch": 2.2, + "learning_rate": 3.431952662721893e-05, + "loss": 1.1712, + "step": 2597, + "task_loss": 0.5775997042655945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6227670907974243, + "epoch": 2.2, + "learning_rate": 3.4313488709093107e-05, + "loss": 1.2516, + "step": 2598, + "task_loss": 1.1307307481765747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4746872186660767, + "epoch": 2.2, + "learning_rate": 3.430745079096728e-05, + "loss": 1.066, + "step": 2599, + "task_loss": 2.412060260772705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8571417331695557, + "epoch": 2.2, + "learning_rate": 3.430141287284145e-05, + "loss": 0.7375, + "step": 2600, + "task_loss": 1.036384105682373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8657087683677673, + "epoch": 2.2, + "learning_rate": 3.4295374954715615e-05, + "loss": 0.9436, + "step": 2601, + "task_loss": 0.8357915878295898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2171831130981445, + "epoch": 2.2, + "learning_rate": 3.428933703658979e-05, + "loss": 1.0872, + "step": 2602, + "task_loss": 0.8800215125083923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0127038955688477, + "epoch": 2.2, + "learning_rate": 3.4283299118463956e-05, + "loss": 1.0037, + "step": 2603, + "task_loss": 1.4417030811309814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7587880492210388, + "epoch": 2.2, + "learning_rate": 3.427726120033812e-05, + "loss": 1.14, + "step": 2604, + "task_loss": 1.0590839385986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 2.2289013862609863, + "epoch": 2.2, + "learning_rate": 3.42712232822123e-05, + "loss": 1.1523, + "step": 2605, + "task_loss": 1.0584945678710938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7854428291320801, + "epoch": 2.2, + "learning_rate": 3.4265185364086464e-05, + "loss": 0.8291, + "step": 2606, + "task_loss": 0.9879540801048279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2266194820404053, + "epoch": 2.2, + "learning_rate": 3.425914744596063e-05, + "loss": 0.9044, + "step": 2607, + "task_loss": 1.2221444845199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3500730991363525, + "epoch": 2.2, + "learning_rate": 3.4253109527834805e-05, + "loss": 1.1623, + "step": 2608, + "task_loss": 1.3208087682724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1848549842834473, + "epoch": 2.21, + "learning_rate": 3.424707160970897e-05, + "loss": 1.2707, + "step": 2609, + "task_loss": 0.9342983961105347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.279867172241211, + "epoch": 2.21, + "learning_rate": 3.4241033691583147e-05, + "loss": 0.6991, + "step": 2610, + "task_loss": 0.3087937533855438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.947964072227478, + "epoch": 2.21, + "learning_rate": 3.4234995773457314e-05, + "loss": 1.1283, + "step": 2611, + "task_loss": 2.706888437271118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0098567008972168, + "epoch": 2.21, + "learning_rate": 3.422895785533148e-05, + "loss": 1.3186, + "step": 2612, + "task_loss": 0.917197048664093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.372505784034729, + "epoch": 2.21, + "learning_rate": 3.4222919937205655e-05, + "loss": 1.371, + "step": 2613, + "task_loss": 0.5588114857673645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5519673824310303, + "epoch": 2.21, + "learning_rate": 3.421688201907982e-05, + "loss": 1.155, + "step": 2614, + "task_loss": 0.17837344110012054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8124719858169556, + "epoch": 2.21, + "learning_rate": 3.421084410095399e-05, + "loss": 0.9052, + "step": 2615, + "task_loss": 1.274876594543457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9510629177093506, + "epoch": 2.21, + "learning_rate": 3.420480618282816e-05, + "loss": 1.2591, + "step": 2616, + "task_loss": 1.456710696220398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7944259643554688, + "epoch": 2.21, + "learning_rate": 3.419876826470233e-05, + "loss": 1.028, + "step": 2617, + "task_loss": 1.876255989074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1987496614456177, + "epoch": 2.21, + "learning_rate": 3.4192730346576504e-05, + "loss": 1.1439, + "step": 2618, + "task_loss": 0.5110378861427307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1718367338180542, + "epoch": 2.21, + "learning_rate": 3.418669242845067e-05, + "loss": 0.9125, + "step": 2619, + "task_loss": 1.310496211051941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9535084366798401, + "epoch": 2.21, + "learning_rate": 3.4180654510324846e-05, + "loss": 1.0658, + "step": 2620, + "task_loss": 1.4184926748275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9133282899856567, + "epoch": 2.22, + "learning_rate": 3.417461659219901e-05, + "loss": 0.9009, + "step": 2621, + "task_loss": 0.5645947456359863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8920131921768188, + "epoch": 2.22, + "learning_rate": 3.416857867407318e-05, + "loss": 0.8062, + "step": 2622, + "task_loss": 1.406044363975525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46550729870796204, + "epoch": 2.22, + "learning_rate": 3.4162540755947354e-05, + "loss": 1.1189, + "step": 2623, + "task_loss": 0.4624236524105072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0660269260406494, + "epoch": 2.22, + "learning_rate": 3.415650283782152e-05, + "loss": 1.3048, + "step": 2624, + "task_loss": 0.7277117371559143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4068208932876587, + "epoch": 2.22, + "learning_rate": 3.415046491969569e-05, + "loss": 1.4128, + "step": 2625, + "task_loss": 0.9926701188087463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7383972406387329, + "epoch": 2.22, + "learning_rate": 3.414442700156986e-05, + "loss": 1.1177, + "step": 2626, + "task_loss": 0.957274317741394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9114563465118408, + "epoch": 2.22, + "learning_rate": 3.413838908344403e-05, + "loss": 0.9853, + "step": 2627, + "task_loss": 0.6193715333938599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1046632528305054, + "epoch": 2.22, + "learning_rate": 3.4132351165318197e-05, + "loss": 0.8785, + "step": 2628, + "task_loss": 1.4196383953094482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8527463674545288, + "epoch": 2.22, + "learning_rate": 3.412631324719237e-05, + "loss": 0.7664, + "step": 2629, + "task_loss": 0.8996005058288574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7241092324256897, + "epoch": 2.22, + "learning_rate": 3.4120275329066544e-05, + "loss": 0.7781, + "step": 2630, + "task_loss": 0.35440385341644287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0816233158111572, + "epoch": 2.22, + "learning_rate": 3.4114237410940705e-05, + "loss": 0.8801, + "step": 2631, + "task_loss": 0.6183938384056091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0265636444091797, + "epoch": 2.22, + "learning_rate": 3.410819949281488e-05, + "loss": 1.0162, + "step": 2632, + "task_loss": 0.30466100573539734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7011849880218506, + "epoch": 2.23, + "learning_rate": 3.410216157468905e-05, + "loss": 1.0393, + "step": 2633, + "task_loss": 0.8757601380348206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9026620388031006, + "epoch": 2.23, + "learning_rate": 3.409612365656322e-05, + "loss": 0.9853, + "step": 2634, + "task_loss": 0.903777539730072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0438021421432495, + "epoch": 2.23, + "learning_rate": 3.409008573843739e-05, + "loss": 1.002, + "step": 2635, + "task_loss": 0.6546807885169983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28651177883148193, + "epoch": 2.23, + "learning_rate": 3.408404782031156e-05, + "loss": 0.7028, + "step": 2636, + "task_loss": 0.10968948900699615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8486788272857666, + "epoch": 2.23, + "learning_rate": 3.407800990218573e-05, + "loss": 0.9936, + "step": 2637, + "task_loss": 0.23335209488868713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44586181640625, + "epoch": 2.23, + "learning_rate": 3.4071971984059895e-05, + "loss": 0.8871, + "step": 2638, + "task_loss": 0.10956268757581711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.157613754272461, + "epoch": 2.23, + "learning_rate": 3.406593406593407e-05, + "loss": 1.0427, + "step": 2639, + "task_loss": 0.6963222622871399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.972432017326355, + "epoch": 2.23, + "learning_rate": 3.405989614780824e-05, + "loss": 0.9158, + "step": 2640, + "task_loss": 1.4774059057235718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.70735764503479, + "epoch": 2.23, + "learning_rate": 3.4053858229682404e-05, + "loss": 1.0131, + "step": 2641, + "task_loss": 2.0583670139312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5798364877700806, + "epoch": 2.23, + "learning_rate": 3.404782031155658e-05, + "loss": 0.997, + "step": 2642, + "task_loss": 1.302799105644226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9422860741615295, + "epoch": 2.23, + "learning_rate": 3.404178239343075e-05, + "loss": 1.043, + "step": 2643, + "task_loss": 1.3856111764907837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6854699850082397, + "epoch": 2.23, + "learning_rate": 3.403574447530491e-05, + "loss": 1.0496, + "step": 2644, + "task_loss": 0.6804392337799072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8499003648757935, + "epoch": 2.24, + "learning_rate": 3.4029706557179086e-05, + "loss": 1.0954, + "step": 2645, + "task_loss": 1.5785785913467407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7805129885673523, + "epoch": 2.24, + "learning_rate": 3.402366863905326e-05, + "loss": 1.1403, + "step": 2646, + "task_loss": 0.5567512512207031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8451440334320068, + "epoch": 2.24, + "learning_rate": 3.401763072092742e-05, + "loss": 0.9345, + "step": 2647, + "task_loss": 0.22002291679382324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7402001619338989, + "epoch": 2.24, + "learning_rate": 3.4011592802801594e-05, + "loss": 0.713, + "step": 2648, + "task_loss": 0.6807963848114014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8235834240913391, + "epoch": 2.24, + "learning_rate": 3.400555488467577e-05, + "loss": 0.8971, + "step": 2649, + "task_loss": 0.7971684336662292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3212652206420898, + "epoch": 2.24, + "learning_rate": 3.3999516966549936e-05, + "loss": 0.9333, + "step": 2650, + "task_loss": 1.540669560432434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8050234913825989, + "epoch": 2.24, + "learning_rate": 3.39934790484241e-05, + "loss": 1.0631, + "step": 2651, + "task_loss": 0.7079522013664246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8887056708335876, + "epoch": 2.24, + "learning_rate": 3.398744113029828e-05, + "loss": 0.9981, + "step": 2652, + "task_loss": 0.6643526554107666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4707534909248352, + "epoch": 2.24, + "learning_rate": 3.3981403212172444e-05, + "loss": 0.7686, + "step": 2653, + "task_loss": 1.320693850517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7952332496643066, + "epoch": 2.24, + "learning_rate": 3.397536529404661e-05, + "loss": 0.8382, + "step": 2654, + "task_loss": 1.5837558507919312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7599750757217407, + "epoch": 2.24, + "learning_rate": 3.3969327375920785e-05, + "loss": 0.8924, + "step": 2655, + "task_loss": 0.8242706060409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9235665798187256, + "epoch": 2.24, + "learning_rate": 3.396328945779496e-05, + "loss": 1.1686, + "step": 2656, + "task_loss": 1.292905330657959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9184669852256775, + "epoch": 2.25, + "learning_rate": 3.395725153966912e-05, + "loss": 1.2078, + "step": 2657, + "task_loss": 1.4248089790344238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9695634245872498, + "epoch": 2.25, + "learning_rate": 3.395121362154329e-05, + "loss": 1.0313, + "step": 2658, + "task_loss": 2.0371057987213135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.575175166130066, + "epoch": 2.25, + "learning_rate": 3.394517570341747e-05, + "loss": 1.1826, + "step": 2659, + "task_loss": 1.4106309413909912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1319276094436646, + "epoch": 2.25, + "learning_rate": 3.393913778529163e-05, + "loss": 0.9901, + "step": 2660, + "task_loss": 0.6710088849067688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8029345273971558, + "epoch": 2.25, + "learning_rate": 3.39330998671658e-05, + "loss": 0.8615, + "step": 2661, + "task_loss": 0.3325652480125427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2347334623336792, + "epoch": 2.25, + "learning_rate": 3.3927061949039976e-05, + "loss": 0.9499, + "step": 2662, + "task_loss": 1.2764862775802612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1245617866516113, + "epoch": 2.25, + "learning_rate": 3.392102403091414e-05, + "loss": 1.0519, + "step": 2663, + "task_loss": 1.120246171951294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9194535613059998, + "epoch": 2.25, + "learning_rate": 3.391498611278831e-05, + "loss": 0.7084, + "step": 2664, + "task_loss": 1.0923113822937012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.976528525352478, + "epoch": 2.25, + "learning_rate": 3.3908948194662484e-05, + "loss": 0.9105, + "step": 2665, + "task_loss": 0.9809689521789551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5307762622833252, + "epoch": 2.25, + "learning_rate": 3.390291027653665e-05, + "loss": 1.1235, + "step": 2666, + "task_loss": 1.7642765045166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9152707457542419, + "epoch": 2.25, + "learning_rate": 3.389687235841082e-05, + "loss": 1.4282, + "step": 2667, + "task_loss": 0.6006597876548767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5972282886505127, + "epoch": 2.26, + "learning_rate": 3.389083444028499e-05, + "loss": 1.1225, + "step": 2668, + "task_loss": 1.4307271242141724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7676655054092407, + "epoch": 2.26, + "learning_rate": 3.388479652215916e-05, + "loss": 1.1761, + "step": 2669, + "task_loss": 1.8560175895690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5895076394081116, + "epoch": 2.26, + "learning_rate": 3.387875860403333e-05, + "loss": 0.7095, + "step": 2670, + "task_loss": 0.07226578891277313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4113216400146484, + "epoch": 2.26, + "learning_rate": 3.38727206859075e-05, + "loss": 1.0912, + "step": 2671, + "task_loss": 1.507526159286499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5729827880859375, + "epoch": 2.26, + "learning_rate": 3.386668276778167e-05, + "loss": 0.9892, + "step": 2672, + "task_loss": 1.5793005228042603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0332493782043457, + "epoch": 2.26, + "learning_rate": 3.386064484965584e-05, + "loss": 0.857, + "step": 2673, + "task_loss": 0.732329249382019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0338621139526367, + "epoch": 2.26, + "learning_rate": 3.385460693153001e-05, + "loss": 0.8643, + "step": 2674, + "task_loss": 1.4471789598464966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.191853642463684, + "epoch": 2.26, + "learning_rate": 3.384856901340418e-05, + "loss": 0.8097, + "step": 2675, + "task_loss": 0.34134289622306824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8285110592842102, + "epoch": 2.26, + "learning_rate": 3.384253109527835e-05, + "loss": 1.0557, + "step": 2676, + "task_loss": 0.5440914034843445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8842818140983582, + "epoch": 2.26, + "learning_rate": 3.383649317715252e-05, + "loss": 0.9394, + "step": 2677, + "task_loss": 0.3601643443107605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6541275978088379, + "epoch": 2.26, + "learning_rate": 3.383045525902669e-05, + "loss": 0.9225, + "step": 2678, + "task_loss": 0.4360131621360779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.724791407585144, + "epoch": 2.26, + "learning_rate": 3.382441734090086e-05, + "loss": 0.9809, + "step": 2679, + "task_loss": 0.12596459686756134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.932608962059021, + "epoch": 2.27, + "learning_rate": 3.3818379422775026e-05, + "loss": 1.0507, + "step": 2680, + "task_loss": 0.6689938306808472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.817481279373169, + "epoch": 2.27, + "learning_rate": 3.38123415046492e-05, + "loss": 0.8276, + "step": 2681, + "task_loss": 0.7658313512802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7116538882255554, + "epoch": 2.27, + "learning_rate": 3.380630358652337e-05, + "loss": 1.1844, + "step": 2682, + "task_loss": 0.3444148898124695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6643540859222412, + "epoch": 2.27, + "learning_rate": 3.380026566839754e-05, + "loss": 0.8891, + "step": 2683, + "task_loss": 0.4557141065597534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5177115201950073, + "epoch": 2.27, + "learning_rate": 3.379422775027171e-05, + "loss": 0.9998, + "step": 2684, + "task_loss": 0.6134933829307556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6955198049545288, + "epoch": 2.27, + "learning_rate": 3.3788189832145875e-05, + "loss": 0.7871, + "step": 2685, + "task_loss": 1.3603031635284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0159592628479004, + "epoch": 2.27, + "learning_rate": 3.378215191402005e-05, + "loss": 0.8162, + "step": 2686, + "task_loss": 0.5143377780914307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2816896438598633, + "epoch": 2.27, + "learning_rate": 3.3776113995894216e-05, + "loss": 1.0803, + "step": 2687, + "task_loss": 0.5969460606575012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0231655836105347, + "epoch": 2.27, + "learning_rate": 3.377007607776838e-05, + "loss": 1.2357, + "step": 2688, + "task_loss": 0.5532245635986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1717865467071533, + "epoch": 2.27, + "learning_rate": 3.376403815964256e-05, + "loss": 0.9498, + "step": 2689, + "task_loss": 1.2657361030578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0489566326141357, + "epoch": 2.27, + "learning_rate": 3.3758000241516725e-05, + "loss": 0.9028, + "step": 2690, + "task_loss": 1.637095332145691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8747189044952393, + "epoch": 2.27, + "learning_rate": 3.37519623233909e-05, + "loss": 0.8691, + "step": 2691, + "task_loss": 1.1510212421417236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7408261299133301, + "epoch": 2.28, + "learning_rate": 3.3745924405265066e-05, + "loss": 0.8189, + "step": 2692, + "task_loss": 0.6442035436630249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6757903099060059, + "epoch": 2.28, + "learning_rate": 3.373988648713924e-05, + "loss": 0.9574, + "step": 2693, + "task_loss": 0.5955031514167786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1754645109176636, + "epoch": 2.28, + "learning_rate": 3.373384856901341e-05, + "loss": 0.9679, + "step": 2694, + "task_loss": 1.7746039628982544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6735185384750366, + "epoch": 2.28, + "learning_rate": 3.3727810650887574e-05, + "loss": 0.8251, + "step": 2695, + "task_loss": 1.2289445400238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.622434139251709, + "epoch": 2.28, + "learning_rate": 3.372177273276175e-05, + "loss": 0.6994, + "step": 2696, + "task_loss": 1.0057252645492554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6504582166671753, + "epoch": 2.28, + "learning_rate": 3.3715734814635915e-05, + "loss": 0.8908, + "step": 2697, + "task_loss": 0.4509908854961395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7204890251159668, + "epoch": 2.28, + "learning_rate": 3.370969689651008e-05, + "loss": 0.8769, + "step": 2698, + "task_loss": 1.3233318328857422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6269570589065552, + "epoch": 2.28, + "learning_rate": 3.3703658978384256e-05, + "loss": 1.1712, + "step": 2699, + "task_loss": 1.956056833267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8725727200508118, + "epoch": 2.28, + "learning_rate": 3.3697621060258423e-05, + "loss": 0.8537, + "step": 2700, + "task_loss": 0.8371046781539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5871586799621582, + "epoch": 2.28, + "learning_rate": 3.369158314213259e-05, + "loss": 0.7538, + "step": 2701, + "task_loss": 1.3855600357055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7420125603675842, + "epoch": 2.28, + "learning_rate": 3.3685545224006765e-05, + "loss": 0.7936, + "step": 2702, + "task_loss": 0.22515350580215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0388225317001343, + "epoch": 2.28, + "learning_rate": 3.367950730588094e-05, + "loss": 0.854, + "step": 2703, + "task_loss": 0.8338340520858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2776868343353271, + "epoch": 2.29, + "learning_rate": 3.36734693877551e-05, + "loss": 1.1214, + "step": 2704, + "task_loss": 1.6590330600738525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0027110576629639, + "epoch": 2.29, + "learning_rate": 3.366743146962927e-05, + "loss": 1.2852, + "step": 2705, + "task_loss": 1.6982241868972778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6925567388534546, + "epoch": 2.29, + "learning_rate": 3.366139355150345e-05, + "loss": 0.9791, + "step": 2706, + "task_loss": 1.2016539573669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9200738668441772, + "epoch": 2.29, + "learning_rate": 3.3655355633377614e-05, + "loss": 1.022, + "step": 2707, + "task_loss": 0.621199905872345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8233129978179932, + "epoch": 2.29, + "learning_rate": 3.364931771525178e-05, + "loss": 0.7675, + "step": 2708, + "task_loss": 1.717698574066162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.7800097465515137, + "epoch": 2.29, + "learning_rate": 3.3643279797125955e-05, + "loss": 1.0293, + "step": 2709, + "task_loss": 1.6631591320037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.835180401802063, + "epoch": 2.29, + "learning_rate": 3.363724187900012e-05, + "loss": 0.8136, + "step": 2710, + "task_loss": 0.8154518604278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4944051504135132, + "epoch": 2.29, + "learning_rate": 3.363120396087429e-05, + "loss": 1.2864, + "step": 2711, + "task_loss": 0.22400406002998352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6651196479797363, + "epoch": 2.29, + "learning_rate": 3.3625166042748464e-05, + "loss": 0.8696, + "step": 2712, + "task_loss": 0.876169741153717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0212856531143188, + "epoch": 2.29, + "learning_rate": 3.361912812462264e-05, + "loss": 0.861, + "step": 2713, + "task_loss": 0.5062984824180603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8903442621231079, + "epoch": 2.29, + "learning_rate": 3.36130902064968e-05, + "loss": 1.0297, + "step": 2714, + "task_loss": 0.9009994864463806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.918868899345398, + "epoch": 2.29, + "learning_rate": 3.360705228837097e-05, + "loss": 1.0413, + "step": 2715, + "task_loss": 1.5730019807815552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.239820122718811, + "epoch": 2.3, + "learning_rate": 3.3601014370245146e-05, + "loss": 1.0922, + "step": 2716, + "task_loss": 2.085752010345459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4795341491699219, + "epoch": 2.3, + "learning_rate": 3.3594976452119306e-05, + "loss": 0.6926, + "step": 2717, + "task_loss": 0.9183290004730225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8154768943786621, + "epoch": 2.3, + "learning_rate": 3.358893853399348e-05, + "loss": 0.7314, + "step": 2718, + "task_loss": 1.0868339538574219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8981808423995972, + "epoch": 2.3, + "learning_rate": 3.3582900615867654e-05, + "loss": 1.0231, + "step": 2719, + "task_loss": 1.1124112606048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8533185124397278, + "epoch": 2.3, + "learning_rate": 3.3576862697741815e-05, + "loss": 0.9899, + "step": 2720, + "task_loss": 1.3460566997528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.029684066772461, + "epoch": 2.3, + "learning_rate": 3.357082477961599e-05, + "loss": 0.9522, + "step": 2721, + "task_loss": 0.6942339539527893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.175058364868164, + "epoch": 2.3, + "learning_rate": 3.356478686149016e-05, + "loss": 0.7325, + "step": 2722, + "task_loss": 0.821764349937439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.682202935218811, + "epoch": 2.3, + "learning_rate": 3.355874894336433e-05, + "loss": 1.0157, + "step": 2723, + "task_loss": 0.6960113644599915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0707149505615234, + "epoch": 2.3, + "learning_rate": 3.35527110252385e-05, + "loss": 0.8266, + "step": 2724, + "task_loss": 0.5615648627281189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7703637480735779, + "epoch": 2.3, + "learning_rate": 3.354667310711267e-05, + "loss": 0.8777, + "step": 2725, + "task_loss": 0.538072407245636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5724782943725586, + "epoch": 2.3, + "learning_rate": 3.354063518898684e-05, + "loss": 0.9046, + "step": 2726, + "task_loss": 0.3673652410507202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7646126747131348, + "epoch": 2.3, + "learning_rate": 3.3534597270861005e-05, + "loss": 0.7657, + "step": 2727, + "task_loss": 0.7637287378311157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.918021023273468, + "epoch": 2.31, + "learning_rate": 3.352855935273518e-05, + "loss": 0.9972, + "step": 2728, + "task_loss": 0.6936092972755432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.450455665588379, + "epoch": 2.31, + "learning_rate": 3.352252143460935e-05, + "loss": 1.0729, + "step": 2729, + "task_loss": 2.500781774520874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9366022944450378, + "epoch": 2.31, + "learning_rate": 3.3516483516483513e-05, + "loss": 0.9498, + "step": 2730, + "task_loss": 0.4676834046840668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2773841321468353, + "epoch": 2.31, + "learning_rate": 3.351044559835769e-05, + "loss": 0.7392, + "step": 2731, + "task_loss": 0.24799323081970215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0175578594207764, + "epoch": 2.31, + "learning_rate": 3.350440768023186e-05, + "loss": 0.962, + "step": 2732, + "task_loss": 0.7949071526527405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0977458953857422, + "epoch": 2.31, + "learning_rate": 3.349836976210603e-05, + "loss": 0.9718, + "step": 2733, + "task_loss": 1.2721599340438843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6375280022621155, + "epoch": 2.31, + "learning_rate": 3.3492331843980196e-05, + "loss": 0.7857, + "step": 2734, + "task_loss": 0.58403080701828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1336392164230347, + "epoch": 2.31, + "learning_rate": 3.348629392585437e-05, + "loss": 1.2442, + "step": 2735, + "task_loss": 0.5983372330665588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6882148385047913, + "epoch": 2.31, + "learning_rate": 3.348025600772854e-05, + "loss": 0.9241, + "step": 2736, + "task_loss": 0.48378580808639526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6819981336593628, + "epoch": 2.31, + "learning_rate": 3.3474218089602704e-05, + "loss": 0.7555, + "step": 2737, + "task_loss": 1.1934655904769897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3990743160247803, + "epoch": 2.31, + "learning_rate": 3.346818017147688e-05, + "loss": 1.0298, + "step": 2738, + "task_loss": 1.1877245903015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5990388989448547, + "epoch": 2.32, + "learning_rate": 3.3462142253351045e-05, + "loss": 0.7776, + "step": 2739, + "task_loss": 0.5888925790786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8682611584663391, + "epoch": 2.32, + "learning_rate": 3.345610433522521e-05, + "loss": 0.9617, + "step": 2740, + "task_loss": 1.3794783353805542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9116361141204834, + "epoch": 2.32, + "learning_rate": 3.3450066417099386e-05, + "loss": 1.3678, + "step": 2741, + "task_loss": 1.0557504892349243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0719447135925293, + "epoch": 2.32, + "learning_rate": 3.3444028498973554e-05, + "loss": 1.0208, + "step": 2742, + "task_loss": 1.0600581169128418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6759663820266724, + "epoch": 2.32, + "learning_rate": 3.343799058084773e-05, + "loss": 0.7183, + "step": 2743, + "task_loss": 2.053166151046753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7869624495506287, + "epoch": 2.32, + "learning_rate": 3.3431952662721895e-05, + "loss": 0.803, + "step": 2744, + "task_loss": 0.5384209156036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2309107780456543, + "epoch": 2.32, + "learning_rate": 3.342591474459606e-05, + "loss": 0.9157, + "step": 2745, + "task_loss": 0.7178307771682739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7603623867034912, + "epoch": 2.32, + "learning_rate": 3.3419876826470236e-05, + "loss": 0.7719, + "step": 2746, + "task_loss": 0.5239214301109314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9308511018753052, + "epoch": 2.32, + "learning_rate": 3.34138389083444e-05, + "loss": 1.0092, + "step": 2747, + "task_loss": 0.89130038022995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6566071510314941, + "epoch": 2.32, + "learning_rate": 3.340780099021858e-05, + "loss": 0.884, + "step": 2748, + "task_loss": 0.9415106773376465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1403889656066895, + "epoch": 2.32, + "learning_rate": 3.3401763072092744e-05, + "loss": 0.7545, + "step": 2749, + "task_loss": 1.6960960626602173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7936314940452576, + "epoch": 2.32, + "learning_rate": 3.339572515396691e-05, + "loss": 0.8289, + "step": 2750, + "task_loss": 0.9571467638015747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8271783590316772, + "epoch": 2.33, + "learning_rate": 3.3389687235841085e-05, + "loss": 0.9586, + "step": 2751, + "task_loss": 1.3062400817871094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7586166858673096, + "epoch": 2.33, + "learning_rate": 3.338364931771525e-05, + "loss": 1.1026, + "step": 2752, + "task_loss": 1.0915030241012573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32363593578338623, + "epoch": 2.33, + "learning_rate": 3.3377611399589426e-05, + "loss": 0.6981, + "step": 2753, + "task_loss": 0.5771260857582092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0949854850769043, + "epoch": 2.33, + "learning_rate": 3.3371573481463594e-05, + "loss": 1.0975, + "step": 2754, + "task_loss": 1.0440804958343506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2271556854248047, + "epoch": 2.33, + "learning_rate": 3.336553556333776e-05, + "loss": 1.1242, + "step": 2755, + "task_loss": 1.2121460437774658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7287555932998657, + "epoch": 2.33, + "learning_rate": 3.3359497645211935e-05, + "loss": 0.9124, + "step": 2756, + "task_loss": 0.6143949627876282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1714069843292236, + "epoch": 2.33, + "learning_rate": 3.33534597270861e-05, + "loss": 1.0478, + "step": 2757, + "task_loss": 1.3940870761871338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5877744555473328, + "epoch": 2.33, + "learning_rate": 3.334742180896027e-05, + "loss": 0.8255, + "step": 2758, + "task_loss": 0.4478219151496887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2909646034240723, + "epoch": 2.33, + "learning_rate": 3.334138389083444e-05, + "loss": 1.0216, + "step": 2759, + "task_loss": 1.5894781351089478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8579210638999939, + "epoch": 2.33, + "learning_rate": 3.333534597270861e-05, + "loss": 0.923, + "step": 2760, + "task_loss": 1.1583107709884644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9678686857223511, + "epoch": 2.33, + "learning_rate": 3.332930805458278e-05, + "loss": 1.0387, + "step": 2761, + "task_loss": 0.7029757499694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2998355627059937, + "epoch": 2.33, + "learning_rate": 3.332327013645695e-05, + "loss": 1.0126, + "step": 2762, + "task_loss": 1.3968712091445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.929969310760498, + "epoch": 2.34, + "learning_rate": 3.3317232218331125e-05, + "loss": 0.8186, + "step": 2763, + "task_loss": 1.5328550338745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.581977367401123, + "epoch": 2.34, + "learning_rate": 3.331119430020529e-05, + "loss": 0.6632, + "step": 2764, + "task_loss": 0.7838281989097595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9163552522659302, + "epoch": 2.34, + "learning_rate": 3.330515638207946e-05, + "loss": 0.7615, + "step": 2765, + "task_loss": 0.7887030243873596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6203932762145996, + "epoch": 2.34, + "learning_rate": 3.3299118463953634e-05, + "loss": 0.7808, + "step": 2766, + "task_loss": 1.2968274354934692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.635939359664917, + "epoch": 2.34, + "learning_rate": 3.32930805458278e-05, + "loss": 0.8846, + "step": 2767, + "task_loss": 0.4422922134399414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8960785865783691, + "epoch": 2.34, + "learning_rate": 3.328704262770197e-05, + "loss": 1.0047, + "step": 2768, + "task_loss": 0.4507089853286743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5274184942245483, + "epoch": 2.34, + "learning_rate": 3.328100470957614e-05, + "loss": 0.8729, + "step": 2769, + "task_loss": 0.44616588950157166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0400034189224243, + "epoch": 2.34, + "learning_rate": 3.327496679145031e-05, + "loss": 0.7517, + "step": 2770, + "task_loss": 0.9686523079872131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0427360534667969, + "epoch": 2.34, + "learning_rate": 3.3268928873324476e-05, + "loss": 0.9794, + "step": 2771, + "task_loss": 0.4361608028411865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0210022926330566, + "epoch": 2.34, + "learning_rate": 3.326289095519865e-05, + "loss": 0.9857, + "step": 2772, + "task_loss": 1.7914854288101196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4057784080505371, + "epoch": 2.34, + "learning_rate": 3.3256853037072824e-05, + "loss": 0.8616, + "step": 2773, + "task_loss": 0.41765663027763367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8480380773544312, + "epoch": 2.34, + "learning_rate": 3.3250815118946985e-05, + "loss": 0.8342, + "step": 2774, + "task_loss": 0.8104041814804077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9782816171646118, + "epoch": 2.35, + "learning_rate": 3.324477720082116e-05, + "loss": 0.9393, + "step": 2775, + "task_loss": 0.8064548969268799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6400340795516968, + "epoch": 2.35, + "learning_rate": 3.323873928269533e-05, + "loss": 0.9706, + "step": 2776, + "task_loss": 0.8559364080429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6473779678344727, + "epoch": 2.35, + "learning_rate": 3.323270136456949e-05, + "loss": 0.6339, + "step": 2777, + "task_loss": 0.7535467147827148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8532047867774963, + "epoch": 2.35, + "learning_rate": 3.322666344644367e-05, + "loss": 0.675, + "step": 2778, + "task_loss": 0.46080926060676575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.70982825756073, + "epoch": 2.35, + "learning_rate": 3.322062552831784e-05, + "loss": 0.8629, + "step": 2779, + "task_loss": 1.0623927116394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5893363952636719, + "epoch": 2.35, + "learning_rate": 3.321458761019201e-05, + "loss": 0.9807, + "step": 2780, + "task_loss": 0.40249910950660706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8153378367424011, + "epoch": 2.35, + "learning_rate": 3.3208549692066175e-05, + "loss": 0.8976, + "step": 2781, + "task_loss": 1.7063082456588745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9056700468063354, + "epoch": 2.35, + "learning_rate": 3.320251177394035e-05, + "loss": 0.9575, + "step": 2782, + "task_loss": 0.9640542268753052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1873252391815186, + "epoch": 2.35, + "learning_rate": 3.3196473855814516e-05, + "loss": 0.9412, + "step": 2783, + "task_loss": 1.4302915334701538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0045500993728638, + "epoch": 2.35, + "learning_rate": 3.3190435937688684e-05, + "loss": 0.7086, + "step": 2784, + "task_loss": 1.048970341682434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2279248237609863, + "epoch": 2.35, + "learning_rate": 3.318439801956286e-05, + "loss": 1.2548, + "step": 2785, + "task_loss": 0.9453883171081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2117135524749756, + "epoch": 2.35, + "learning_rate": 3.317836010143703e-05, + "loss": 1.3307, + "step": 2786, + "task_loss": 1.372259259223938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8918375968933105, + "epoch": 2.36, + "learning_rate": 3.317232218331119e-05, + "loss": 1.0388, + "step": 2787, + "task_loss": 1.1482336521148682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9327211380004883, + "epoch": 2.36, + "learning_rate": 3.3166284265185366e-05, + "loss": 0.796, + "step": 2788, + "task_loss": 1.1190260648727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.742501974105835, + "epoch": 2.36, + "learning_rate": 3.316024634705954e-05, + "loss": 1.0226, + "step": 2789, + "task_loss": 0.8503888845443726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5036033391952515, + "epoch": 2.36, + "learning_rate": 3.31542084289337e-05, + "loss": 0.9159, + "step": 2790, + "task_loss": 0.5288277268409729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7272175550460815, + "epoch": 2.36, + "learning_rate": 3.3148170510807874e-05, + "loss": 0.9801, + "step": 2791, + "task_loss": 0.8269513845443726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.867139458656311, + "epoch": 2.36, + "learning_rate": 3.314213259268205e-05, + "loss": 0.9429, + "step": 2792, + "task_loss": 0.5630469918251038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3334500789642334, + "epoch": 2.36, + "learning_rate": 3.3136094674556215e-05, + "loss": 0.9502, + "step": 2793, + "task_loss": 1.378792643547058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7074646949768066, + "epoch": 2.36, + "learning_rate": 3.313005675643038e-05, + "loss": 1.0753, + "step": 2794, + "task_loss": 0.49129319190979004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5991054773330688, + "epoch": 2.36, + "learning_rate": 3.3124018838304557e-05, + "loss": 0.9418, + "step": 2795, + "task_loss": 0.47627317905426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4860683679580688, + "epoch": 2.36, + "learning_rate": 3.3117980920178724e-05, + "loss": 1.1168, + "step": 2796, + "task_loss": 1.0081244707107544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8480997085571289, + "epoch": 2.36, + "learning_rate": 3.311194300205289e-05, + "loss": 0.8715, + "step": 2797, + "task_loss": 1.1260061264038086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2352893352508545, + "epoch": 2.36, + "learning_rate": 3.3105905083927065e-05, + "loss": 0.8859, + "step": 2798, + "task_loss": 0.44881337881088257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6530822515487671, + "epoch": 2.37, + "learning_rate": 3.309986716580123e-05, + "loss": 0.8176, + "step": 2799, + "task_loss": 1.0597397089004517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2304600477218628, + "epoch": 2.37, + "learning_rate": 3.30938292476754e-05, + "loss": 1.0072, + "step": 2800, + "task_loss": 1.1774234771728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8958079814910889, + "epoch": 2.37, + "learning_rate": 3.308779132954957e-05, + "loss": 0.7998, + "step": 2801, + "task_loss": 0.8911529183387756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1642166376113892, + "epoch": 2.37, + "learning_rate": 3.308175341142374e-05, + "loss": 0.8822, + "step": 2802, + "task_loss": 1.0235081911087036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.834490180015564, + "epoch": 2.37, + "learning_rate": 3.3075715493297914e-05, + "loss": 0.8435, + "step": 2803, + "task_loss": 0.9120036363601685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1348217725753784, + "epoch": 2.37, + "learning_rate": 3.306967757517208e-05, + "loss": 0.8852, + "step": 2804, + "task_loss": 1.1849141120910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8975103497505188, + "epoch": 2.37, + "learning_rate": 3.3063639657046255e-05, + "loss": 0.9898, + "step": 2805, + "task_loss": 1.3823573589324951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5769085884094238, + "epoch": 2.37, + "learning_rate": 3.305760173892042e-05, + "loss": 0.8847, + "step": 2806, + "task_loss": 0.6307712197303772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8605297207832336, + "epoch": 2.37, + "learning_rate": 3.305156382079459e-05, + "loss": 0.9355, + "step": 2807, + "task_loss": 0.8241601586341858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.180167555809021, + "epoch": 2.37, + "learning_rate": 3.3045525902668764e-05, + "loss": 0.7468, + "step": 2808, + "task_loss": 1.04692542552948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8402324318885803, + "epoch": 2.37, + "learning_rate": 3.303948798454293e-05, + "loss": 0.7038, + "step": 2809, + "task_loss": 0.6949109435081482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7122187614440918, + "epoch": 2.38, + "learning_rate": 3.30334500664171e-05, + "loss": 0.909, + "step": 2810, + "task_loss": 0.7559311985969543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7848840951919556, + "epoch": 2.38, + "learning_rate": 3.302741214829127e-05, + "loss": 0.992, + "step": 2811, + "task_loss": 0.6812652945518494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1048786640167236, + "epoch": 2.38, + "learning_rate": 3.302137423016544e-05, + "loss": 0.8967, + "step": 2812, + "task_loss": 0.9271783828735352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5336031913757324, + "epoch": 2.38, + "learning_rate": 3.301533631203961e-05, + "loss": 0.8999, + "step": 2813, + "task_loss": 1.6040239334106445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5380577445030212, + "epoch": 2.38, + "learning_rate": 3.300929839391378e-05, + "loss": 0.8099, + "step": 2814, + "task_loss": 0.14455053210258484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7060345411300659, + "epoch": 2.38, + "learning_rate": 3.300326047578795e-05, + "loss": 0.9763, + "step": 2815, + "task_loss": 0.6798547506332397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2111506462097168, + "epoch": 2.38, + "learning_rate": 3.299722255766212e-05, + "loss": 0.9161, + "step": 2816, + "task_loss": 1.1247963905334473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1909908056259155, + "epoch": 2.38, + "learning_rate": 3.299118463953629e-05, + "loss": 0.7299, + "step": 2817, + "task_loss": 1.0985205173492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4357483386993408, + "epoch": 2.38, + "learning_rate": 3.2985146721410456e-05, + "loss": 0.5673, + "step": 2818, + "task_loss": 0.37735363841056824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5549491047859192, + "epoch": 2.38, + "learning_rate": 3.297910880328463e-05, + "loss": 0.6768, + "step": 2819, + "task_loss": 0.6331230998039246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8248299360275269, + "epoch": 2.38, + "learning_rate": 3.29730708851588e-05, + "loss": 0.6741, + "step": 2820, + "task_loss": 1.3869222402572632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.961200475692749, + "epoch": 2.38, + "learning_rate": 3.296703296703297e-05, + "loss": 0.9932, + "step": 2821, + "task_loss": 0.5817019939422607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.923088788986206, + "epoch": 2.39, + "learning_rate": 3.296099504890714e-05, + "loss": 1.0372, + "step": 2822, + "task_loss": 1.1444703340530396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6645529270172119, + "epoch": 2.39, + "learning_rate": 3.2954957130781305e-05, + "loss": 0.8437, + "step": 2823, + "task_loss": 1.094321608543396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7674187421798706, + "epoch": 2.39, + "learning_rate": 3.294891921265548e-05, + "loss": 0.8493, + "step": 2824, + "task_loss": 0.5415501594543457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5413985252380371, + "epoch": 2.39, + "learning_rate": 3.2942881294529647e-05, + "loss": 0.8957, + "step": 2825, + "task_loss": 0.619649350643158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8950704336166382, + "epoch": 2.39, + "learning_rate": 3.293684337640382e-05, + "loss": 0.8396, + "step": 2826, + "task_loss": 1.071526288986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.696370244026184, + "epoch": 2.39, + "learning_rate": 3.293080545827799e-05, + "loss": 1.3135, + "step": 2827, + "task_loss": 1.1475722789764404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6022390723228455, + "epoch": 2.39, + "learning_rate": 3.2924767540152155e-05, + "loss": 0.7315, + "step": 2828, + "task_loss": 0.7150059342384338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.560997724533081, + "epoch": 2.39, + "learning_rate": 3.291872962202633e-05, + "loss": 0.822, + "step": 2829, + "task_loss": 1.1898075342178345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0872727632522583, + "epoch": 2.39, + "learning_rate": 3.2912691703900496e-05, + "loss": 1.0204, + "step": 2830, + "task_loss": 0.6172168850898743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.04429030418396, + "epoch": 2.39, + "learning_rate": 3.290665378577466e-05, + "loss": 0.7524, + "step": 2831, + "task_loss": 0.36200955510139465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.831575870513916, + "epoch": 2.39, + "learning_rate": 3.290061586764884e-05, + "loss": 1.0549, + "step": 2832, + "task_loss": 0.4593145251274109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6238629221916199, + "epoch": 2.39, + "learning_rate": 3.2894577949523004e-05, + "loss": 0.6351, + "step": 2833, + "task_loss": 1.0745619535446167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8361895680427551, + "epoch": 2.4, + "learning_rate": 3.288854003139717e-05, + "loss": 1.0352, + "step": 2834, + "task_loss": 0.8293092250823975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7514456510543823, + "epoch": 2.4, + "learning_rate": 3.2882502113271346e-05, + "loss": 0.8595, + "step": 2835, + "task_loss": 0.6336367130279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1775598526000977, + "epoch": 2.4, + "learning_rate": 3.287646419514552e-05, + "loss": 0.9379, + "step": 2836, + "task_loss": 1.566910743713379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6616877913475037, + "epoch": 2.4, + "learning_rate": 3.287042627701969e-05, + "loss": 0.7718, + "step": 2837, + "task_loss": 0.7921245694160461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6777482032775879, + "epoch": 2.4, + "learning_rate": 3.2864388358893854e-05, + "loss": 0.8376, + "step": 2838, + "task_loss": 0.4684142768383026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.862629234790802, + "epoch": 2.4, + "learning_rate": 3.285835044076803e-05, + "loss": 0.7408, + "step": 2839, + "task_loss": 0.6321136951446533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7331017255783081, + "epoch": 2.4, + "learning_rate": 3.2852312522642195e-05, + "loss": 1.0422, + "step": 2840, + "task_loss": 1.3112554550170898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6967334747314453, + "epoch": 2.4, + "learning_rate": 3.284627460451636e-05, + "loss": 0.7056, + "step": 2841, + "task_loss": 1.161782145500183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8684355020523071, + "epoch": 2.4, + "learning_rate": 3.2840236686390536e-05, + "loss": 0.7655, + "step": 2842, + "task_loss": 0.7267280220985413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9047896862030029, + "epoch": 2.4, + "learning_rate": 3.28341987682647e-05, + "loss": 0.9178, + "step": 2843, + "task_loss": 0.7912650108337402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5731492042541504, + "epoch": 2.4, + "learning_rate": 3.282816085013887e-05, + "loss": 0.9212, + "step": 2844, + "task_loss": 0.3461819291114807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8881270885467529, + "epoch": 2.4, + "learning_rate": 3.2822122932013044e-05, + "loss": 0.755, + "step": 2845, + "task_loss": 0.5680453777313232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6593795418739319, + "epoch": 2.41, + "learning_rate": 3.281608501388722e-05, + "loss": 0.9017, + "step": 2846, + "task_loss": 0.3043040633201599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8972082734107971, + "epoch": 2.41, + "learning_rate": 3.281004709576138e-05, + "loss": 0.9019, + "step": 2847, + "task_loss": 0.9299072623252869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2110645771026611, + "epoch": 2.41, + "learning_rate": 3.280400917763555e-05, + "loss": 1.0781, + "step": 2848, + "task_loss": 0.7348819971084595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9715776443481445, + "epoch": 2.41, + "learning_rate": 3.279797125950973e-05, + "loss": 1.1793, + "step": 2849, + "task_loss": 0.541412889957428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2992650270462036, + "epoch": 2.41, + "learning_rate": 3.279193334138389e-05, + "loss": 0.9831, + "step": 2850, + "task_loss": 0.9367376565933228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9535198211669922, + "epoch": 2.41, + "learning_rate": 3.278589542325806e-05, + "loss": 0.8742, + "step": 2851, + "task_loss": 0.28230205178260803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7684029340744019, + "epoch": 2.41, + "learning_rate": 3.2779857505132235e-05, + "loss": 0.827, + "step": 2852, + "task_loss": 1.345329761505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5055259466171265, + "epoch": 2.41, + "learning_rate": 3.27738195870064e-05, + "loss": 0.8198, + "step": 2853, + "task_loss": 0.12649044394493103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8105188012123108, + "epoch": 2.41, + "learning_rate": 3.276778166888057e-05, + "loss": 0.9507, + "step": 2854, + "task_loss": 0.6093190312385559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2786235809326172, + "epoch": 2.41, + "learning_rate": 3.276174375075474e-05, + "loss": 1.0195, + "step": 2855, + "task_loss": 2.1092777252197266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7351697683334351, + "epoch": 2.41, + "learning_rate": 3.275570583262891e-05, + "loss": 1.0528, + "step": 2856, + "task_loss": 0.9971580505371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5019831657409668, + "epoch": 2.41, + "learning_rate": 3.274966791450308e-05, + "loss": 0.8468, + "step": 2857, + "task_loss": 1.1405714750289917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2786078453063965, + "epoch": 2.42, + "learning_rate": 3.274362999637725e-05, + "loss": 0.9961, + "step": 2858, + "task_loss": 1.215188980102539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0205318927764893, + "epoch": 2.42, + "learning_rate": 3.273759207825142e-05, + "loss": 0.8504, + "step": 2859, + "task_loss": 0.5503911375999451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4503839313983917, + "epoch": 2.42, + "learning_rate": 3.2731554160125586e-05, + "loss": 0.7484, + "step": 2860, + "task_loss": 0.6186447143554688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.034995436668396, + "epoch": 2.42, + "learning_rate": 3.272551624199976e-05, + "loss": 0.8673, + "step": 2861, + "task_loss": 1.4720786809921265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6631806492805481, + "epoch": 2.42, + "learning_rate": 3.2719478323873934e-05, + "loss": 0.7874, + "step": 2862, + "task_loss": 0.44145673513412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6990792155265808, + "epoch": 2.42, + "learning_rate": 3.2713440405748094e-05, + "loss": 0.947, + "step": 2863, + "task_loss": 1.4039933681488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6229379177093506, + "epoch": 2.42, + "learning_rate": 3.270740248762227e-05, + "loss": 0.6284, + "step": 2864, + "task_loss": 0.46257296204566956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5364512205123901, + "epoch": 2.42, + "learning_rate": 3.270136456949644e-05, + "loss": 0.8307, + "step": 2865, + "task_loss": 0.2517240345478058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0853326320648193, + "epoch": 2.42, + "learning_rate": 3.269532665137061e-05, + "loss": 0.9419, + "step": 2866, + "task_loss": 1.204682469367981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6977158784866333, + "epoch": 2.42, + "learning_rate": 3.268928873324478e-05, + "loss": 0.9476, + "step": 2867, + "task_loss": 0.3645823001861572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6663147211074829, + "epoch": 2.42, + "learning_rate": 3.268325081511895e-05, + "loss": 0.7112, + "step": 2868, + "task_loss": 0.8512425422668457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6107907295227051, + "epoch": 2.42, + "learning_rate": 3.267721289699312e-05, + "loss": 0.6773, + "step": 2869, + "task_loss": 0.41885432600975037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2693182229995728, + "epoch": 2.43, + "learning_rate": 3.2671174978867285e-05, + "loss": 0.9869, + "step": 2870, + "task_loss": 0.635145366191864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6516168117523193, + "epoch": 2.43, + "learning_rate": 3.266513706074146e-05, + "loss": 0.9784, + "step": 2871, + "task_loss": 1.0750597715377808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4344112873077393, + "epoch": 2.43, + "learning_rate": 3.2659099142615626e-05, + "loss": 0.9266, + "step": 2872, + "task_loss": 1.3012166023254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49300891160964966, + "epoch": 2.43, + "learning_rate": 3.265306122448979e-05, + "loss": 0.6958, + "step": 2873, + "task_loss": 0.711258590221405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4768095016479492, + "epoch": 2.43, + "learning_rate": 3.264702330636397e-05, + "loss": 0.8277, + "step": 2874, + "task_loss": 0.2731589376926422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1137490272521973, + "epoch": 2.43, + "learning_rate": 3.2640985388238134e-05, + "loss": 1.0096, + "step": 2875, + "task_loss": 1.9950792789459229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5181125402450562, + "epoch": 2.43, + "learning_rate": 3.263494747011231e-05, + "loss": 0.8841, + "step": 2876, + "task_loss": 0.7141669392585754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9992743730545044, + "epoch": 2.43, + "learning_rate": 3.2628909551986476e-05, + "loss": 0.7069, + "step": 2877, + "task_loss": 0.7759259343147278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7502714395523071, + "epoch": 2.43, + "learning_rate": 3.262287163386065e-05, + "loss": 0.8964, + "step": 2878, + "task_loss": 1.2772576808929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8539506793022156, + "epoch": 2.43, + "learning_rate": 3.261683371573482e-05, + "loss": 0.7983, + "step": 2879, + "task_loss": 0.4517708420753479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0385268926620483, + "epoch": 2.43, + "learning_rate": 3.2610795797608984e-05, + "loss": 0.8242, + "step": 2880, + "task_loss": 1.0699708461761475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9664639234542847, + "epoch": 2.44, + "learning_rate": 3.260475787948316e-05, + "loss": 0.9845, + "step": 2881, + "task_loss": 0.5256713032722473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1014221906661987, + "epoch": 2.44, + "learning_rate": 3.2598719961357325e-05, + "loss": 1.0038, + "step": 2882, + "task_loss": 0.7624884247779846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.132041573524475, + "epoch": 2.44, + "learning_rate": 3.259268204323149e-05, + "loss": 0.736, + "step": 2883, + "task_loss": 0.7164139151573181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49275892972946167, + "epoch": 2.44, + "learning_rate": 3.2586644125105666e-05, + "loss": 0.7155, + "step": 2884, + "task_loss": 0.17327173054218292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.59247624874115, + "epoch": 2.44, + "learning_rate": 3.2580606206979833e-05, + "loss": 0.9866, + "step": 2885, + "task_loss": 1.4928157329559326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7771472334861755, + "epoch": 2.44, + "learning_rate": 3.257456828885401e-05, + "loss": 1.2261, + "step": 2886, + "task_loss": 0.9209288954734802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2777198255062103, + "epoch": 2.44, + "learning_rate": 3.2568530370728175e-05, + "loss": 0.7097, + "step": 2887, + "task_loss": 0.3477313220500946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48255491256713867, + "epoch": 2.44, + "learning_rate": 3.256249245260234e-05, + "loss": 0.7836, + "step": 2888, + "task_loss": 0.5507585406303406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7601661682128906, + "epoch": 2.44, + "learning_rate": 3.2556454534476516e-05, + "loss": 0.8158, + "step": 2889, + "task_loss": 0.4794238805770874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6322106719017029, + "epoch": 2.44, + "learning_rate": 3.255041661635068e-05, + "loss": 0.755, + "step": 2890, + "task_loss": 0.31476354598999023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4842289984226227, + "epoch": 2.44, + "learning_rate": 3.254437869822485e-05, + "loss": 0.6458, + "step": 2891, + "task_loss": 0.18893949687480927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7494486570358276, + "epoch": 2.44, + "learning_rate": 3.2538340780099024e-05, + "loss": 1.0072, + "step": 2892, + "task_loss": 0.28199902176856995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.688312292098999, + "epoch": 2.45, + "learning_rate": 3.253230286197319e-05, + "loss": 0.7717, + "step": 2893, + "task_loss": 0.3075966536998749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6612006425857544, + "epoch": 2.45, + "learning_rate": 3.2526264943847365e-05, + "loss": 0.6545, + "step": 2894, + "task_loss": 0.6770530939102173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6166071891784668, + "epoch": 2.45, + "learning_rate": 3.252022702572153e-05, + "loss": 1.0283, + "step": 2895, + "task_loss": 0.7821600437164307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9795551300048828, + "epoch": 2.45, + "learning_rate": 3.2514189107595706e-05, + "loss": 0.76, + "step": 2896, + "task_loss": 0.22820217907428741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3775837421417236, + "epoch": 2.45, + "learning_rate": 3.2508151189469873e-05, + "loss": 0.9753, + "step": 2897, + "task_loss": 1.1048611402511597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7362343072891235, + "epoch": 2.45, + "learning_rate": 3.250211327134404e-05, + "loss": 1.0185, + "step": 2898, + "task_loss": 0.7415258288383484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6338307857513428, + "epoch": 2.45, + "learning_rate": 3.2496075353218215e-05, + "loss": 0.7083, + "step": 2899, + "task_loss": 0.5021026134490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7585917711257935, + "epoch": 2.45, + "learning_rate": 3.249003743509238e-05, + "loss": 1.0034, + "step": 2900, + "task_loss": 0.4196048676967621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.132533311843872, + "epoch": 2.45, + "learning_rate": 3.248399951696655e-05, + "loss": 1.0083, + "step": 2901, + "task_loss": 1.1303118467330933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.837515652179718, + "epoch": 2.45, + "learning_rate": 3.247796159884072e-05, + "loss": 0.8018, + "step": 2902, + "task_loss": 0.6076485514640808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0906003713607788, + "epoch": 2.45, + "learning_rate": 3.247192368071489e-05, + "loss": 0.796, + "step": 2903, + "task_loss": 0.8274542689323425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7722394466400146, + "epoch": 2.45, + "learning_rate": 3.246588576258906e-05, + "loss": 1.0006, + "step": 2904, + "task_loss": 0.727177619934082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3603975772857666, + "epoch": 2.46, + "learning_rate": 3.245984784446323e-05, + "loss": 0.9392, + "step": 2905, + "task_loss": 1.7391974925994873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4708527326583862, + "epoch": 2.46, + "learning_rate": 3.2453809926337405e-05, + "loss": 1.1332, + "step": 2906, + "task_loss": 1.1995388269424438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6748366355895996, + "epoch": 2.46, + "learning_rate": 3.2447772008211566e-05, + "loss": 0.9604, + "step": 2907, + "task_loss": 1.1734504699707031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5978589057922363, + "epoch": 2.46, + "learning_rate": 3.244173409008574e-05, + "loss": 0.8322, + "step": 2908, + "task_loss": 0.5152594447135925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0617574453353882, + "epoch": 2.46, + "learning_rate": 3.2435696171959914e-05, + "loss": 1.09, + "step": 2909, + "task_loss": 1.6811085939407349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.708771824836731, + "epoch": 2.46, + "learning_rate": 3.242965825383408e-05, + "loss": 0.8687, + "step": 2910, + "task_loss": 1.0094149112701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6269038915634155, + "epoch": 2.46, + "learning_rate": 3.242362033570825e-05, + "loss": 0.7852, + "step": 2911, + "task_loss": 0.4934190809726715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0415931940078735, + "epoch": 2.46, + "learning_rate": 3.241758241758242e-05, + "loss": 0.8899, + "step": 2912, + "task_loss": 0.5737878084182739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.421137809753418, + "epoch": 2.46, + "learning_rate": 3.241154449945659e-05, + "loss": 1.1366, + "step": 2913, + "task_loss": 0.7453801035881042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9945685863494873, + "epoch": 2.46, + "learning_rate": 3.2405506581330756e-05, + "loss": 1.0365, + "step": 2914, + "task_loss": 1.2096803188323975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9649375081062317, + "epoch": 2.46, + "learning_rate": 3.239946866320493e-05, + "loss": 0.7832, + "step": 2915, + "task_loss": 1.0220212936401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9059720635414124, + "epoch": 2.46, + "learning_rate": 3.2393430745079104e-05, + "loss": 1.0342, + "step": 2916, + "task_loss": 1.1540971994400024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7540695071220398, + "epoch": 2.47, + "learning_rate": 3.2387392826953265e-05, + "loss": 0.9253, + "step": 2917, + "task_loss": 0.6525171995162964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8361202478408813, + "epoch": 2.47, + "learning_rate": 3.238135490882744e-05, + "loss": 0.8631, + "step": 2918, + "task_loss": 0.369408518075943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7345149517059326, + "epoch": 2.47, + "learning_rate": 3.237531699070161e-05, + "loss": 0.8672, + "step": 2919, + "task_loss": 0.14087851345539093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8696631193161011, + "epoch": 2.47, + "learning_rate": 3.236927907257577e-05, + "loss": 0.9238, + "step": 2920, + "task_loss": 0.7390062808990479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4212777614593506, + "epoch": 2.47, + "learning_rate": 3.236324115444995e-05, + "loss": 0.8344, + "step": 2921, + "task_loss": 0.5525619387626648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6521879434585571, + "epoch": 2.47, + "learning_rate": 3.235720323632412e-05, + "loss": 0.8863, + "step": 2922, + "task_loss": 0.7836416959762573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7679495811462402, + "epoch": 2.47, + "learning_rate": 3.235116531819828e-05, + "loss": 0.8288, + "step": 2923, + "task_loss": 0.7002469897270203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4390490651130676, + "epoch": 2.47, + "learning_rate": 3.2345127400072455e-05, + "loss": 0.8028, + "step": 2924, + "task_loss": 1.423789381980896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5713815093040466, + "epoch": 2.47, + "learning_rate": 3.233908948194663e-05, + "loss": 0.8462, + "step": 2925, + "task_loss": 0.6427159309387207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4166971445083618, + "epoch": 2.47, + "learning_rate": 3.2333051563820796e-05, + "loss": 0.655, + "step": 2926, + "task_loss": 1.1895174980163574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8471949100494385, + "epoch": 2.47, + "learning_rate": 3.2327013645694964e-05, + "loss": 0.834, + "step": 2927, + "task_loss": 0.794459879398346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6519914269447327, + "epoch": 2.47, + "learning_rate": 3.232097572756914e-05, + "loss": 0.7574, + "step": 2928, + "task_loss": 0.8581552505493164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5764175653457642, + "epoch": 2.48, + "learning_rate": 3.2314937809443305e-05, + "loss": 0.9341, + "step": 2929, + "task_loss": 0.5125983357429504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27868857979774475, + "epoch": 2.48, + "learning_rate": 3.230889989131747e-05, + "loss": 0.6759, + "step": 2930, + "task_loss": 0.6560837626457214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6543096303939819, + "epoch": 2.48, + "learning_rate": 3.2302861973191646e-05, + "loss": 0.6799, + "step": 2931, + "task_loss": 0.20707793533802032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.816749632358551, + "epoch": 2.48, + "learning_rate": 3.229682405506581e-05, + "loss": 0.9246, + "step": 2932, + "task_loss": 0.8930086493492126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7288960218429565, + "epoch": 2.48, + "learning_rate": 3.229078613693998e-05, + "loss": 0.7857, + "step": 2933, + "task_loss": 0.42078742384910583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6835066080093384, + "epoch": 2.48, + "learning_rate": 3.2284748218814154e-05, + "loss": 0.8932, + "step": 2934, + "task_loss": 1.129055142402649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4528479278087616, + "epoch": 2.48, + "learning_rate": 3.227871030068833e-05, + "loss": 0.6802, + "step": 2935, + "task_loss": 0.05345381051301956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.053920030593872, + "epoch": 2.48, + "learning_rate": 3.2272672382562495e-05, + "loss": 0.8972, + "step": 2936, + "task_loss": 0.4809158742427826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9291329979896545, + "epoch": 2.48, + "learning_rate": 3.226663446443666e-05, + "loss": 0.9904, + "step": 2937, + "task_loss": 0.6103377938270569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6649560332298279, + "epoch": 2.48, + "learning_rate": 3.2260596546310836e-05, + "loss": 0.7974, + "step": 2938, + "task_loss": 0.2971968650817871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2891366481781006, + "epoch": 2.48, + "learning_rate": 3.2254558628185004e-05, + "loss": 1.0864, + "step": 2939, + "task_loss": 1.5032516717910767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9032121896743774, + "epoch": 2.48, + "learning_rate": 3.224852071005917e-05, + "loss": 0.8203, + "step": 2940, + "task_loss": 0.8152036070823669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.300257682800293, + "epoch": 2.49, + "learning_rate": 3.2242482791933345e-05, + "loss": 0.9658, + "step": 2941, + "task_loss": 1.5385836362838745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1240324974060059, + "epoch": 2.49, + "learning_rate": 3.223644487380751e-05, + "loss": 0.8849, + "step": 2942, + "task_loss": 1.1655923128128052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9554330110549927, + "epoch": 2.49, + "learning_rate": 3.223040695568168e-05, + "loss": 0.9336, + "step": 2943, + "task_loss": 0.6401989459991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8290506601333618, + "epoch": 2.49, + "learning_rate": 3.222436903755585e-05, + "loss": 0.9738, + "step": 2944, + "task_loss": 0.9150649309158325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1051589250564575, + "epoch": 2.49, + "learning_rate": 3.221833111943002e-05, + "loss": 0.8357, + "step": 2945, + "task_loss": 0.9484920501708984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5697426199913025, + "epoch": 2.49, + "learning_rate": 3.2212293201304194e-05, + "loss": 0.919, + "step": 2946, + "task_loss": 1.0301179885864258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9359353184700012, + "epoch": 2.49, + "learning_rate": 3.220625528317836e-05, + "loss": 0.9764, + "step": 2947, + "task_loss": 1.0430269241333008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7498339414596558, + "epoch": 2.49, + "learning_rate": 3.220021736505253e-05, + "loss": 0.779, + "step": 2948, + "task_loss": 0.6550315618515015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0643386840820312, + "epoch": 2.49, + "learning_rate": 3.21941794469267e-05, + "loss": 0.9679, + "step": 2949, + "task_loss": 1.3420933485031128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7117494940757751, + "epoch": 2.49, + "learning_rate": 3.218814152880087e-05, + "loss": 0.8809, + "step": 2950, + "task_loss": 0.6326349973678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7402175664901733, + "epoch": 2.49, + "learning_rate": 3.2182103610675044e-05, + "loss": 0.7969, + "step": 2951, + "task_loss": 0.5275585651397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5870144963264465, + "epoch": 2.5, + "learning_rate": 3.217606569254921e-05, + "loss": 0.7889, + "step": 2952, + "task_loss": 1.1345893144607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.132422685623169, + "epoch": 2.5, + "learning_rate": 3.217002777442338e-05, + "loss": 1.0642, + "step": 2953, + "task_loss": 0.45857974886894226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6052035689353943, + "epoch": 2.5, + "learning_rate": 3.216398985629755e-05, + "loss": 0.7485, + "step": 2954, + "task_loss": 0.18246588110923767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.745585560798645, + "epoch": 2.5, + "learning_rate": 3.215795193817172e-05, + "loss": 0.8852, + "step": 2955, + "task_loss": 1.3094991445541382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9829490780830383, + "epoch": 2.5, + "learning_rate": 3.215191402004589e-05, + "loss": 1.1625, + "step": 2956, + "task_loss": 0.8204315900802612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4937748908996582, + "epoch": 2.5, + "learning_rate": 3.214587610192006e-05, + "loss": 1.0725, + "step": 2957, + "task_loss": 0.8999999761581421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6362663507461548, + "epoch": 2.5, + "learning_rate": 3.213983818379423e-05, + "loss": 0.7003, + "step": 2958, + "task_loss": 0.8550742864608765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5605192184448242, + "epoch": 2.5, + "learning_rate": 3.21338002656684e-05, + "loss": 0.5748, + "step": 2959, + "task_loss": 0.22620804607868195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6340289115905762, + "epoch": 2.5, + "learning_rate": 3.212776234754257e-05, + "loss": 0.7998, + "step": 2960, + "task_loss": 0.37425655126571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8747180700302124, + "epoch": 2.5, + "learning_rate": 3.2121724429416736e-05, + "loss": 0.8811, + "step": 2961, + "task_loss": 0.8462705016136169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5913041830062866, + "epoch": 2.5, + "learning_rate": 3.211568651129091e-05, + "loss": 0.6872, + "step": 2962, + "task_loss": 1.6385858058929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.849287748336792, + "epoch": 2.5, + "learning_rate": 3.210964859316508e-05, + "loss": 0.9858, + "step": 2963, + "task_loss": 0.7815146446228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0486512184143066, + "epoch": 2.51, + "learning_rate": 3.2103610675039244e-05, + "loss": 0.7977, + "step": 2964, + "task_loss": 0.8510364294052124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6807609796524048, + "epoch": 2.51, + "learning_rate": 3.209757275691342e-05, + "loss": 0.774, + "step": 2965, + "task_loss": 0.4950663149356842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4242117702960968, + "epoch": 2.51, + "learning_rate": 3.209153483878759e-05, + "loss": 0.5973, + "step": 2966, + "task_loss": 0.6834632754325867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39475733041763306, + "epoch": 2.51, + "learning_rate": 3.208549692066176e-05, + "loss": 0.6665, + "step": 2967, + "task_loss": 0.7661213278770447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8595190644264221, + "epoch": 2.51, + "learning_rate": 3.2079459002535926e-05, + "loss": 0.8313, + "step": 2968, + "task_loss": 0.6501460075378418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8260771036148071, + "epoch": 2.51, + "learning_rate": 3.20734210844101e-05, + "loss": 0.789, + "step": 2969, + "task_loss": 0.9016792178153992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7676417231559753, + "epoch": 2.51, + "learning_rate": 3.206738316628427e-05, + "loss": 0.7597, + "step": 2970, + "task_loss": 1.440747857093811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5636439919471741, + "epoch": 2.51, + "learning_rate": 3.2061345248158435e-05, + "loss": 0.7299, + "step": 2971, + "task_loss": 0.2645760178565979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9017335176467896, + "epoch": 2.51, + "learning_rate": 3.205530733003261e-05, + "loss": 0.7208, + "step": 2972, + "task_loss": 0.6191311478614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7303603291511536, + "epoch": 2.51, + "learning_rate": 3.2049269411906776e-05, + "loss": 1.0316, + "step": 2973, + "task_loss": 1.1697274446487427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2573659420013428, + "epoch": 2.51, + "learning_rate": 3.204323149378094e-05, + "loss": 0.8891, + "step": 2974, + "task_loss": 0.6063340902328491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2797832489013672, + "epoch": 2.51, + "learning_rate": 3.203719357565512e-05, + "loss": 1.065, + "step": 2975, + "task_loss": 1.8085486888885498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0743029117584229, + "epoch": 2.52, + "learning_rate": 3.203115565752929e-05, + "loss": 0.9763, + "step": 2976, + "task_loss": 1.7480192184448242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.512091338634491, + "epoch": 2.52, + "learning_rate": 3.202511773940345e-05, + "loss": 0.7786, + "step": 2977, + "task_loss": 0.24623432755470276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.981354832649231, + "epoch": 2.52, + "learning_rate": 3.2019079821277625e-05, + "loss": 0.7434, + "step": 2978, + "task_loss": 0.6764379143714905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.519895315170288, + "epoch": 2.52, + "learning_rate": 3.20130419031518e-05, + "loss": 0.9868, + "step": 2979, + "task_loss": 0.7029870748519897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.362274408340454, + "epoch": 2.52, + "learning_rate": 3.200700398502596e-05, + "loss": 0.8779, + "step": 2980, + "task_loss": 1.2854820489883423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0162653923034668, + "epoch": 2.52, + "learning_rate": 3.2000966066900134e-05, + "loss": 0.9447, + "step": 2981, + "task_loss": 0.4324296712875366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.697432279586792, + "epoch": 2.52, + "learning_rate": 3.199492814877431e-05, + "loss": 0.8042, + "step": 2982, + "task_loss": 0.09608100354671478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0489168167114258, + "epoch": 2.52, + "learning_rate": 3.1988890230648475e-05, + "loss": 0.9978, + "step": 2983, + "task_loss": 0.8962480425834656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8684440851211548, + "epoch": 2.52, + "learning_rate": 3.198285231252264e-05, + "loss": 0.8753, + "step": 2984, + "task_loss": 0.7597934603691101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9698823690414429, + "epoch": 2.52, + "learning_rate": 3.1976814394396816e-05, + "loss": 0.7752, + "step": 2985, + "task_loss": 0.43564245104789734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7125217914581299, + "epoch": 2.52, + "learning_rate": 3.197077647627098e-05, + "loss": 0.5691, + "step": 2986, + "task_loss": 0.5527481436729431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0032899379730225, + "epoch": 2.52, + "learning_rate": 3.196473855814515e-05, + "loss": 0.86, + "step": 2987, + "task_loss": 1.0519012212753296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5853729248046875, + "epoch": 2.53, + "learning_rate": 3.1958700640019324e-05, + "loss": 0.6881, + "step": 2988, + "task_loss": 0.742026150226593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9301421642303467, + "epoch": 2.53, + "learning_rate": 3.195266272189349e-05, + "loss": 0.8811, + "step": 2989, + "task_loss": 0.524196445941925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6418254971504211, + "epoch": 2.53, + "learning_rate": 3.194662480376766e-05, + "loss": 0.6905, + "step": 2990, + "task_loss": 0.4861406981945038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7434477806091309, + "epoch": 2.53, + "learning_rate": 3.194058688564183e-05, + "loss": 0.8005, + "step": 2991, + "task_loss": 1.0192149877548218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6885080933570862, + "epoch": 2.53, + "learning_rate": 3.1934548967516007e-05, + "loss": 1.0182, + "step": 2992, + "task_loss": 1.0171197652816772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.048293948173523, + "epoch": 2.53, + "learning_rate": 3.192851104939017e-05, + "loss": 0.9432, + "step": 2993, + "task_loss": 0.9086986780166626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7131342887878418, + "epoch": 2.53, + "learning_rate": 3.192247313126434e-05, + "loss": 0.6426, + "step": 2994, + "task_loss": 1.1337015628814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2279751300811768, + "epoch": 2.53, + "learning_rate": 3.1916435213138515e-05, + "loss": 0.7557, + "step": 2995, + "task_loss": 1.118836760520935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6218761205673218, + "epoch": 2.53, + "learning_rate": 3.1910397295012675e-05, + "loss": 0.8212, + "step": 2996, + "task_loss": 0.9928039908409119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3336478471755981, + "epoch": 2.53, + "learning_rate": 3.190435937688685e-05, + "loss": 0.7164, + "step": 2997, + "task_loss": 0.604114830493927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9010549187660217, + "epoch": 2.53, + "learning_rate": 3.189832145876102e-05, + "loss": 0.8179, + "step": 2998, + "task_loss": 0.7371923923492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9644243717193604, + "epoch": 2.53, + "learning_rate": 3.189228354063519e-05, + "loss": 0.8855, + "step": 2999, + "task_loss": 0.88642418384552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0338208675384521, + "epoch": 2.54, + "learning_rate": 3.188624562250936e-05, + "loss": 1.0236, + "step": 3000, + "task_loss": 0.3367408215999603 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.8822574257425743, + "eval_loss": 0.5016362071037292, + "eval_runtime": 227.3911, + "eval_samples_per_second": 111.042, + "eval_steps_per_second": 0.871, + "step": 3000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6137768030166626, + "epoch": 2.54, + "learning_rate": 3.188020770438353e-05, + "loss": 0.7235, + "step": 3001, + "task_loss": 0.271555095911026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6915023326873779, + "epoch": 2.54, + "learning_rate": 3.18741697862577e-05, + "loss": 0.8103, + "step": 3002, + "task_loss": 0.9231040477752686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.659912109375, + "epoch": 2.54, + "learning_rate": 3.1868131868131866e-05, + "loss": 0.7704, + "step": 3003, + "task_loss": 0.9349153637886047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42083296179771423, + "epoch": 2.54, + "learning_rate": 3.186209395000604e-05, + "loss": 0.8225, + "step": 3004, + "task_loss": 0.07340937852859497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5846946239471436, + "epoch": 2.54, + "learning_rate": 3.185605603188021e-05, + "loss": 0.735, + "step": 3005, + "task_loss": 0.9627697467803955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1618759632110596, + "epoch": 2.54, + "learning_rate": 3.1850018113754374e-05, + "loss": 1.0428, + "step": 3006, + "task_loss": 1.5028562545776367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.761713981628418, + "epoch": 2.54, + "learning_rate": 3.184398019562855e-05, + "loss": 0.7175, + "step": 3007, + "task_loss": 0.6191214323043823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7332868576049805, + "epoch": 2.54, + "learning_rate": 3.183794227750272e-05, + "loss": 0.709, + "step": 3008, + "task_loss": 0.5360862016677856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8282040357589722, + "epoch": 2.54, + "learning_rate": 3.183190435937689e-05, + "loss": 0.7121, + "step": 3009, + "task_loss": 1.2541381120681763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6773777008056641, + "epoch": 2.54, + "learning_rate": 3.1825866441251057e-05, + "loss": 0.7403, + "step": 3010, + "task_loss": 0.9211320877075195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8626466989517212, + "epoch": 2.54, + "learning_rate": 3.181982852312523e-05, + "loss": 0.6791, + "step": 3011, + "task_loss": 0.8365532755851746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44061243534088135, + "epoch": 2.55, + "learning_rate": 3.18137906049994e-05, + "loss": 0.4858, + "step": 3012, + "task_loss": 0.30462759733200073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5855493545532227, + "epoch": 2.55, + "learning_rate": 3.1807752686873565e-05, + "loss": 0.7486, + "step": 3013, + "task_loss": 0.5436987280845642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.089611530303955, + "epoch": 2.55, + "learning_rate": 3.180171476874774e-05, + "loss": 0.7826, + "step": 3014, + "task_loss": 0.9176214933395386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8924753665924072, + "epoch": 2.55, + "learning_rate": 3.1795676850621906e-05, + "loss": 0.908, + "step": 3015, + "task_loss": 0.5962092876434326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.076035737991333, + "epoch": 2.55, + "learning_rate": 3.178963893249607e-05, + "loss": 1.1465, + "step": 3016, + "task_loss": 1.4187004566192627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9886857867240906, + "epoch": 2.55, + "learning_rate": 3.178360101437025e-05, + "loss": 1.2795, + "step": 3017, + "task_loss": 1.4709323644638062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9635641574859619, + "epoch": 2.55, + "learning_rate": 3.1777563096244414e-05, + "loss": 0.7376, + "step": 3018, + "task_loss": 0.8006808757781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.098738431930542, + "epoch": 2.55, + "learning_rate": 3.177152517811859e-05, + "loss": 0.9761, + "step": 3019, + "task_loss": 1.2263888120651245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7483527660369873, + "epoch": 2.55, + "learning_rate": 3.1765487259992755e-05, + "loss": 0.7067, + "step": 3020, + "task_loss": 0.7314642667770386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5958069562911987, + "epoch": 2.55, + "learning_rate": 3.175944934186692e-05, + "loss": 0.733, + "step": 3021, + "task_loss": 0.6028617024421692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5792432427406311, + "epoch": 2.55, + "learning_rate": 3.17534114237411e-05, + "loss": 0.8012, + "step": 3022, + "task_loss": 1.1551947593688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5220740437507629, + "epoch": 2.56, + "learning_rate": 3.1747373505615264e-05, + "loss": 0.8099, + "step": 3023, + "task_loss": 1.2454135417938232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6018650531768799, + "epoch": 2.56, + "learning_rate": 3.174133558748944e-05, + "loss": 0.6439, + "step": 3024, + "task_loss": 0.16588714718818665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8027732372283936, + "epoch": 2.56, + "learning_rate": 3.1735297669363605e-05, + "loss": 0.7643, + "step": 3025, + "task_loss": 0.8107626438140869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.727070152759552, + "epoch": 2.56, + "learning_rate": 3.172925975123777e-05, + "loss": 0.8428, + "step": 3026, + "task_loss": 0.8587751388549805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7050327658653259, + "epoch": 2.56, + "learning_rate": 3.1723221833111946e-05, + "loss": 0.7917, + "step": 3027, + "task_loss": 0.8078317642211914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5212845802307129, + "epoch": 2.56, + "learning_rate": 3.171718391498611e-05, + "loss": 0.9311, + "step": 3028, + "task_loss": 0.25558459758758545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6377017498016357, + "epoch": 2.56, + "learning_rate": 3.171114599686029e-05, + "loss": 0.8297, + "step": 3029, + "task_loss": 0.8167658448219299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8483272194862366, + "epoch": 2.56, + "learning_rate": 3.1705108078734454e-05, + "loss": 0.6666, + "step": 3030, + "task_loss": 1.0536823272705078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47415128350257874, + "epoch": 2.56, + "learning_rate": 3.169907016060862e-05, + "loss": 0.6084, + "step": 3031, + "task_loss": 0.6789657473564148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.128638505935669, + "epoch": 2.56, + "learning_rate": 3.1693032242482796e-05, + "loss": 1.1184, + "step": 3032, + "task_loss": 1.1000689268112183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7621657252311707, + "epoch": 2.56, + "learning_rate": 3.168699432435696e-05, + "loss": 0.8358, + "step": 3033, + "task_loss": 0.979108989238739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3162500858306885, + "epoch": 2.56, + "learning_rate": 3.168095640623113e-05, + "loss": 1.0163, + "step": 3034, + "task_loss": 2.5295801162719727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7265914678573608, + "epoch": 2.57, + "learning_rate": 3.1674918488105304e-05, + "loss": 0.8471, + "step": 3035, + "task_loss": 1.5632015466690063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7244623303413391, + "epoch": 2.57, + "learning_rate": 3.166888056997947e-05, + "loss": 0.86, + "step": 3036, + "task_loss": 1.9515035152435303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5082241892814636, + "epoch": 2.57, + "learning_rate": 3.166284265185364e-05, + "loss": 0.6857, + "step": 3037, + "task_loss": 0.49375560879707336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5705839395523071, + "epoch": 2.57, + "learning_rate": 3.165680473372781e-05, + "loss": 0.5545, + "step": 3038, + "task_loss": 1.2205630540847778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9945900440216064, + "epoch": 2.57, + "learning_rate": 3.1650766815601986e-05, + "loss": 0.7314, + "step": 3039, + "task_loss": 1.4323033094406128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7085701823234558, + "epoch": 2.57, + "learning_rate": 3.164472889747615e-05, + "loss": 0.8374, + "step": 3040, + "task_loss": 1.307729721069336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.029331922531128, + "epoch": 2.57, + "learning_rate": 3.163869097935032e-05, + "loss": 0.8646, + "step": 3041, + "task_loss": 1.0929069519042969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8046635389328003, + "epoch": 2.57, + "learning_rate": 3.1632653061224494e-05, + "loss": 0.7035, + "step": 3042, + "task_loss": 1.3054965734481812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.9130637645721436, + "epoch": 2.57, + "learning_rate": 3.162661514309866e-05, + "loss": 1.1315, + "step": 3043, + "task_loss": 1.776525616645813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7463591694831848, + "epoch": 2.57, + "learning_rate": 3.162057722497283e-05, + "loss": 0.9202, + "step": 3044, + "task_loss": 0.7726787328720093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7884398698806763, + "epoch": 2.57, + "learning_rate": 3.1614539306847e-05, + "loss": 0.9153, + "step": 3045, + "task_loss": 0.7687554359436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7593898773193359, + "epoch": 2.57, + "learning_rate": 3.160850138872117e-05, + "loss": 0.7302, + "step": 3046, + "task_loss": 0.28603827953338623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8466780185699463, + "epoch": 2.58, + "learning_rate": 3.160246347059534e-05, + "loss": 0.7639, + "step": 3047, + "task_loss": 0.5504293441772461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9991329312324524, + "epoch": 2.58, + "learning_rate": 3.159642555246951e-05, + "loss": 0.8233, + "step": 3048, + "task_loss": 1.758773922920227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8021063208580017, + "epoch": 2.58, + "learning_rate": 3.1590387634343685e-05, + "loss": 0.8472, + "step": 3049, + "task_loss": 1.1081922054290771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6507203578948975, + "epoch": 2.58, + "learning_rate": 3.1584349716217846e-05, + "loss": 1.0514, + "step": 3050, + "task_loss": 0.5546426773071289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7957897782325745, + "epoch": 2.58, + "learning_rate": 3.157831179809202e-05, + "loss": 0.7781, + "step": 3051, + "task_loss": 1.141638994216919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5324211120605469, + "epoch": 2.58, + "learning_rate": 3.1572273879966193e-05, + "loss": 0.9575, + "step": 3052, + "task_loss": 1.2046325206756592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45889613032341003, + "epoch": 2.58, + "learning_rate": 3.1566235961840354e-05, + "loss": 0.7687, + "step": 3053, + "task_loss": 0.9140346646308899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4761022925376892, + "epoch": 2.58, + "learning_rate": 3.156019804371453e-05, + "loss": 0.6024, + "step": 3054, + "task_loss": 0.8140660524368286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7592402696609497, + "epoch": 2.58, + "learning_rate": 3.15541601255887e-05, + "loss": 1.0425, + "step": 3055, + "task_loss": 1.0749025344848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4261313676834106, + "epoch": 2.58, + "learning_rate": 3.154812220746287e-05, + "loss": 0.8604, + "step": 3056, + "task_loss": 1.2256821393966675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38518714904785156, + "epoch": 2.58, + "learning_rate": 3.1542084289337036e-05, + "loss": 0.6433, + "step": 3057, + "task_loss": 0.4681018888950348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7753989696502686, + "epoch": 2.58, + "learning_rate": 3.153604637121121e-05, + "loss": 0.7666, + "step": 3058, + "task_loss": 1.3992558717727661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23168912529945374, + "epoch": 2.59, + "learning_rate": 3.153000845308538e-05, + "loss": 0.6235, + "step": 3059, + "task_loss": 0.07382439821958542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5133727788925171, + "epoch": 2.59, + "learning_rate": 3.1523970534959544e-05, + "loss": 0.795, + "step": 3060, + "task_loss": 0.8130411505699158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.837613582611084, + "epoch": 2.59, + "learning_rate": 3.151793261683372e-05, + "loss": 0.8778, + "step": 3061, + "task_loss": 0.8012933135032654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7988501787185669, + "epoch": 2.59, + "learning_rate": 3.1511894698707886e-05, + "loss": 0.8781, + "step": 3062, + "task_loss": 1.2246960401535034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5927332639694214, + "epoch": 2.59, + "learning_rate": 3.150585678058205e-05, + "loss": 0.8893, + "step": 3063, + "task_loss": 1.0391918420791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3367778062820435, + "epoch": 2.59, + "learning_rate": 3.149981886245623e-05, + "loss": 1.1085, + "step": 3064, + "task_loss": 1.1869428157806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.576911211013794, + "epoch": 2.59, + "learning_rate": 3.14937809443304e-05, + "loss": 1.0024, + "step": 3065, + "task_loss": 1.638088583946228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9077416062355042, + "epoch": 2.59, + "learning_rate": 3.148774302620456e-05, + "loss": 0.7884, + "step": 3066, + "task_loss": 1.0289536714553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8118168711662292, + "epoch": 2.59, + "learning_rate": 3.1481705108078735e-05, + "loss": 0.82, + "step": 3067, + "task_loss": 0.5601198077201843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7285646200180054, + "epoch": 2.59, + "learning_rate": 3.147566718995291e-05, + "loss": 0.8963, + "step": 3068, + "task_loss": 0.9830752015113831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6709139943122864, + "epoch": 2.59, + "learning_rate": 3.1469629271827076e-05, + "loss": 0.886, + "step": 3069, + "task_loss": 1.1587276458740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.618017852306366, + "epoch": 2.59, + "learning_rate": 3.146359135370124e-05, + "loss": 0.6264, + "step": 3070, + "task_loss": 0.818087100982666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8378186225891113, + "epoch": 2.6, + "learning_rate": 3.145755343557542e-05, + "loss": 0.9971, + "step": 3071, + "task_loss": 1.3869907855987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0918958187103271, + "epoch": 2.6, + "learning_rate": 3.1451515517449585e-05, + "loss": 0.9418, + "step": 3072, + "task_loss": 0.6294479370117188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5628905892372131, + "epoch": 2.6, + "learning_rate": 3.144547759932375e-05, + "loss": 0.8751, + "step": 3073, + "task_loss": 0.973656177520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5180924534797668, + "epoch": 2.6, + "learning_rate": 3.1439439681197926e-05, + "loss": 0.7902, + "step": 3074, + "task_loss": 0.3605163097381592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7125868201255798, + "epoch": 2.6, + "learning_rate": 3.143340176307209e-05, + "loss": 0.8437, + "step": 3075, + "task_loss": 0.8415375351905823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6082790493965149, + "epoch": 2.6, + "learning_rate": 3.142736384494626e-05, + "loss": 0.7835, + "step": 3076, + "task_loss": 1.2885019779205322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38996610045433044, + "epoch": 2.6, + "learning_rate": 3.1421325926820434e-05, + "loss": 0.7277, + "step": 3077, + "task_loss": 1.1668273210525513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6524943709373474, + "epoch": 2.6, + "learning_rate": 3.14152880086946e-05, + "loss": 0.6903, + "step": 3078, + "task_loss": 0.7020797729492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7283225059509277, + "epoch": 2.6, + "learning_rate": 3.1409250090568775e-05, + "loss": 0.7024, + "step": 3079, + "task_loss": 1.188352108001709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7896183729171753, + "epoch": 2.6, + "learning_rate": 3.140321217244294e-05, + "loss": 0.9581, + "step": 3080, + "task_loss": 0.18475015461444855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9431156516075134, + "epoch": 2.6, + "learning_rate": 3.1397174254317116e-05, + "loss": 0.7703, + "step": 3081, + "task_loss": 1.1637721061706543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8105146884918213, + "epoch": 2.6, + "learning_rate": 3.1391136336191283e-05, + "loss": 0.7755, + "step": 3082, + "task_loss": 0.6414182186126709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6288378238677979, + "epoch": 2.61, + "learning_rate": 3.138509841806545e-05, + "loss": 0.7791, + "step": 3083, + "task_loss": 0.66705322265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7495559453964233, + "epoch": 2.61, + "learning_rate": 3.1379060499939625e-05, + "loss": 0.6502, + "step": 3084, + "task_loss": 1.3439288139343262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8147602081298828, + "epoch": 2.61, + "learning_rate": 3.137302258181379e-05, + "loss": 0.8291, + "step": 3085, + "task_loss": 0.8545053601264954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0447710752487183, + "epoch": 2.61, + "learning_rate": 3.136698466368796e-05, + "loss": 0.7472, + "step": 3086, + "task_loss": 0.6808056831359863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38031962513923645, + "epoch": 2.61, + "learning_rate": 3.136094674556213e-05, + "loss": 0.6652, + "step": 3087, + "task_loss": 0.2978646159172058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5546494722366333, + "epoch": 2.61, + "learning_rate": 3.13549088274363e-05, + "loss": 0.8103, + "step": 3088, + "task_loss": 0.8973349928855896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1533368825912476, + "epoch": 2.61, + "learning_rate": 3.1348870909310474e-05, + "loss": 0.8295, + "step": 3089, + "task_loss": 0.9379809498786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5743615627288818, + "epoch": 2.61, + "learning_rate": 3.134283299118464e-05, + "loss": 0.7668, + "step": 3090, + "task_loss": 0.7315385341644287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8335127830505371, + "epoch": 2.61, + "learning_rate": 3.133679507305881e-05, + "loss": 0.6969, + "step": 3091, + "task_loss": 0.3726682662963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.624243974685669, + "epoch": 2.61, + "learning_rate": 3.133075715493298e-05, + "loss": 0.6344, + "step": 3092, + "task_loss": 0.9143445491790771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0778112411499023, + "epoch": 2.61, + "learning_rate": 3.132471923680715e-05, + "loss": 0.8574, + "step": 3093, + "task_loss": 1.0519269704818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1987361907958984, + "epoch": 2.61, + "learning_rate": 3.131868131868132e-05, + "loss": 0.8806, + "step": 3094, + "task_loss": 1.0485796928405762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6606401801109314, + "epoch": 2.62, + "learning_rate": 3.131264340055549e-05, + "loss": 0.7936, + "step": 3095, + "task_loss": 0.9130484461784363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4712564945220947, + "epoch": 2.62, + "learning_rate": 3.130660548242966e-05, + "loss": 0.6055, + "step": 3096, + "task_loss": 0.4532923698425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.084006667137146, + "epoch": 2.62, + "learning_rate": 3.130056756430383e-05, + "loss": 0.757, + "step": 3097, + "task_loss": 0.8979833722114563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5892452597618103, + "epoch": 2.62, + "learning_rate": 3.1294529646178e-05, + "loss": 0.7548, + "step": 3098, + "task_loss": 1.4288254976272583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7405765056610107, + "epoch": 2.62, + "learning_rate": 3.128849172805217e-05, + "loss": 0.7866, + "step": 3099, + "task_loss": 0.9639761447906494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.563779890537262, + "epoch": 2.62, + "learning_rate": 3.128245380992634e-05, + "loss": 0.7741, + "step": 3100, + "task_loss": 0.5200707316398621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2534862756729126, + "epoch": 2.62, + "learning_rate": 3.127641589180051e-05, + "loss": 0.7147, + "step": 3101, + "task_loss": 2.0381574630737305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7990745306015015, + "epoch": 2.62, + "learning_rate": 3.127037797367468e-05, + "loss": 0.7792, + "step": 3102, + "task_loss": 1.1806010007858276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3845534920692444, + "epoch": 2.62, + "learning_rate": 3.126434005554885e-05, + "loss": 0.6327, + "step": 3103, + "task_loss": 0.2133253514766693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.693825364112854, + "epoch": 2.62, + "learning_rate": 3.1258302137423016e-05, + "loss": 0.7545, + "step": 3104, + "task_loss": 0.6199763417243958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5323759317398071, + "epoch": 2.62, + "learning_rate": 3.125226421929719e-05, + "loss": 0.7527, + "step": 3105, + "task_loss": 0.2206573635339737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4812619984149933, + "epoch": 2.63, + "learning_rate": 3.124622630117136e-05, + "loss": 0.7548, + "step": 3106, + "task_loss": 0.27425816655158997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.792110025882721, + "epoch": 2.63, + "learning_rate": 3.1240188383045524e-05, + "loss": 0.9064, + "step": 3107, + "task_loss": 0.28309768438339233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7624838352203369, + "epoch": 2.63, + "learning_rate": 3.12341504649197e-05, + "loss": 0.7936, + "step": 3108, + "task_loss": 0.527056097984314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6111307144165039, + "epoch": 2.63, + "learning_rate": 3.122811254679387e-05, + "loss": 0.8503, + "step": 3109, + "task_loss": 1.0702322721481323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5091174840927124, + "epoch": 2.63, + "learning_rate": 3.122207462866803e-05, + "loss": 0.6543, + "step": 3110, + "task_loss": 0.42716890573501587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4714750647544861, + "epoch": 2.63, + "learning_rate": 3.1216036710542206e-05, + "loss": 0.6903, + "step": 3111, + "task_loss": 1.036287784576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8974889516830444, + "epoch": 2.63, + "learning_rate": 3.120999879241638e-05, + "loss": 0.837, + "step": 3112, + "task_loss": 1.0372660160064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5923638343811035, + "epoch": 2.63, + "learning_rate": 3.120396087429055e-05, + "loss": 0.613, + "step": 3113, + "task_loss": 0.47694918513298035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.103044033050537, + "epoch": 2.63, + "learning_rate": 3.1197922956164715e-05, + "loss": 0.832, + "step": 3114, + "task_loss": 2.234790086746216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5502097606658936, + "epoch": 2.63, + "learning_rate": 3.119188503803889e-05, + "loss": 0.8846, + "step": 3115, + "task_loss": 0.8045153021812439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7336527109146118, + "epoch": 2.63, + "learning_rate": 3.1185847119913056e-05, + "loss": 0.7745, + "step": 3116, + "task_loss": 0.747377336025238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6990717053413391, + "epoch": 2.63, + "learning_rate": 3.117980920178722e-05, + "loss": 0.7442, + "step": 3117, + "task_loss": 1.1197246313095093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4856246709823608, + "epoch": 2.64, + "learning_rate": 3.11737712836614e-05, + "loss": 1.1407, + "step": 3118, + "task_loss": 2.2340946197509766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5579985976219177, + "epoch": 2.64, + "learning_rate": 3.1167733365535564e-05, + "loss": 0.7828, + "step": 3119, + "task_loss": 0.27241620421409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5260222554206848, + "epoch": 2.64, + "learning_rate": 3.116169544740973e-05, + "loss": 0.7017, + "step": 3120, + "task_loss": 0.06501312553882599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5153244137763977, + "epoch": 2.64, + "learning_rate": 3.1155657529283905e-05, + "loss": 0.6811, + "step": 3121, + "task_loss": 0.1749415397644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6429804563522339, + "epoch": 2.64, + "learning_rate": 3.114961961115808e-05, + "loss": 0.8026, + "step": 3122, + "task_loss": 0.09109488129615784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7604629993438721, + "epoch": 2.64, + "learning_rate": 3.114358169303224e-05, + "loss": 0.749, + "step": 3123, + "task_loss": 0.7961565256118774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6233325004577637, + "epoch": 2.64, + "learning_rate": 3.1137543774906414e-05, + "loss": 0.7718, + "step": 3124, + "task_loss": 0.48125159740448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2885315418243408, + "epoch": 2.64, + "learning_rate": 3.113150585678059e-05, + "loss": 0.936, + "step": 3125, + "task_loss": 1.109978437423706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3276715576648712, + "epoch": 2.64, + "learning_rate": 3.112546793865475e-05, + "loss": 0.8305, + "step": 3126, + "task_loss": 0.11410202831029892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36628955602645874, + "epoch": 2.64, + "learning_rate": 3.111943002052892e-05, + "loss": 0.69, + "step": 3127, + "task_loss": 0.5616279244422913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8504805564880371, + "epoch": 2.64, + "learning_rate": 3.1113392102403096e-05, + "loss": 0.8499, + "step": 3128, + "task_loss": 0.8784685730934143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3713712692260742, + "epoch": 2.64, + "learning_rate": 3.110735418427726e-05, + "loss": 0.8338, + "step": 3129, + "task_loss": 0.27613407373428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8702806234359741, + "epoch": 2.65, + "learning_rate": 3.110131626615143e-05, + "loss": 0.6871, + "step": 3130, + "task_loss": 1.1232000589370728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7637176513671875, + "epoch": 2.65, + "learning_rate": 3.1095278348025604e-05, + "loss": 0.7968, + "step": 3131, + "task_loss": 0.7490792274475098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6001620888710022, + "epoch": 2.65, + "learning_rate": 3.108924042989977e-05, + "loss": 0.9106, + "step": 3132, + "task_loss": 0.17001810669898987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0916117429733276, + "epoch": 2.65, + "learning_rate": 3.108320251177394e-05, + "loss": 0.7219, + "step": 3133, + "task_loss": 0.7988651990890503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6563551425933838, + "epoch": 2.65, + "learning_rate": 3.107716459364811e-05, + "loss": 0.9079, + "step": 3134, + "task_loss": 1.3962843418121338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8960903286933899, + "epoch": 2.65, + "learning_rate": 3.107112667552228e-05, + "loss": 0.9444, + "step": 3135, + "task_loss": 1.1886022090911865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6900730133056641, + "epoch": 2.65, + "learning_rate": 3.106508875739645e-05, + "loss": 0.7975, + "step": 3136, + "task_loss": 0.6726140975952148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.432685911655426, + "epoch": 2.65, + "learning_rate": 3.105905083927062e-05, + "loss": 0.5401, + "step": 3137, + "task_loss": 0.6507964134216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.711495041847229, + "epoch": 2.65, + "learning_rate": 3.1053012921144795e-05, + "loss": 1.0285, + "step": 3138, + "task_loss": 1.0023207664489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.820142388343811, + "epoch": 2.65, + "learning_rate": 3.104697500301896e-05, + "loss": 0.8467, + "step": 3139, + "task_loss": 1.6331762075424194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2314832210540771, + "epoch": 2.65, + "learning_rate": 3.104093708489313e-05, + "loss": 0.8991, + "step": 3140, + "task_loss": 1.2430920600891113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.758161187171936, + "epoch": 2.65, + "learning_rate": 3.10348991667673e-05, + "loss": 0.794, + "step": 3141, + "task_loss": 1.0795671939849854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4688737392425537, + "epoch": 2.66, + "learning_rate": 3.102886124864147e-05, + "loss": 0.9038, + "step": 3142, + "task_loss": 0.23454007506370544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8360175490379333, + "epoch": 2.66, + "learning_rate": 3.102282333051564e-05, + "loss": 0.8185, + "step": 3143, + "task_loss": 1.3201662302017212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5686139464378357, + "epoch": 2.66, + "learning_rate": 3.101678541238981e-05, + "loss": 0.7297, + "step": 3144, + "task_loss": 0.3480425179004669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8402721285820007, + "epoch": 2.66, + "learning_rate": 3.101074749426398e-05, + "loss": 0.7674, + "step": 3145, + "task_loss": 0.8077020645141602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5086648464202881, + "epoch": 2.66, + "learning_rate": 3.1004709576138146e-05, + "loss": 0.7615, + "step": 3146, + "task_loss": 1.242014765739441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1624759435653687, + "epoch": 2.66, + "learning_rate": 3.099867165801232e-05, + "loss": 0.8184, + "step": 3147, + "task_loss": 1.2872920036315918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4037091135978699, + "epoch": 2.66, + "learning_rate": 3.099263373988649e-05, + "loss": 0.7922, + "step": 3148, + "task_loss": 0.31010016798973083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0317448377609253, + "epoch": 2.66, + "learning_rate": 3.098659582176066e-05, + "loss": 0.8005, + "step": 3149, + "task_loss": 0.6081992983818054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5967278480529785, + "epoch": 2.66, + "learning_rate": 3.098055790363483e-05, + "loss": 0.6801, + "step": 3150, + "task_loss": 0.9046477675437927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7411437630653381, + "epoch": 2.66, + "learning_rate": 3.0974519985508995e-05, + "loss": 0.9658, + "step": 3151, + "task_loss": 0.948363721370697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0255472660064697, + "epoch": 2.66, + "learning_rate": 3.096848206738317e-05, + "loss": 0.7736, + "step": 3152, + "task_loss": 0.9569589495658875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6174736618995667, + "epoch": 2.66, + "learning_rate": 3.0962444149257336e-05, + "loss": 0.6975, + "step": 3153, + "task_loss": 0.7506214380264282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0273759365081787, + "epoch": 2.67, + "learning_rate": 3.095640623113151e-05, + "loss": 0.7917, + "step": 3154, + "task_loss": 0.5526015758514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8444082736968994, + "epoch": 2.67, + "learning_rate": 3.095036831300568e-05, + "loss": 0.8802, + "step": 3155, + "task_loss": 0.8181344270706177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.035127878189087, + "epoch": 2.67, + "learning_rate": 3.0944330394879845e-05, + "loss": 1.0111, + "step": 3156, + "task_loss": 0.9723500609397888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6508625745773315, + "epoch": 2.67, + "learning_rate": 3.093829247675402e-05, + "loss": 0.6386, + "step": 3157, + "task_loss": 0.2908296585083008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8350207209587097, + "epoch": 2.67, + "learning_rate": 3.0932254558628186e-05, + "loss": 1.164, + "step": 3158, + "task_loss": 1.3280986547470093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0200244188308716, + "epoch": 2.67, + "learning_rate": 3.092621664050236e-05, + "loss": 0.8382, + "step": 3159, + "task_loss": 1.6663730144500732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.043365478515625, + "epoch": 2.67, + "learning_rate": 3.092017872237653e-05, + "loss": 0.8781, + "step": 3160, + "task_loss": 0.9484541416168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9852643013000488, + "epoch": 2.67, + "learning_rate": 3.0914140804250694e-05, + "loss": 0.7058, + "step": 3161, + "task_loss": 1.019930362701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.070119857788086, + "epoch": 2.67, + "learning_rate": 3.090810288612487e-05, + "loss": 0.7159, + "step": 3162, + "task_loss": 1.010020136833191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8089014887809753, + "epoch": 2.67, + "learning_rate": 3.0902064967999035e-05, + "loss": 0.554, + "step": 3163, + "task_loss": 1.0365593433380127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9561063647270203, + "epoch": 2.67, + "learning_rate": 3.08960270498732e-05, + "loss": 0.7181, + "step": 3164, + "task_loss": 1.5164334774017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6077285408973694, + "epoch": 2.67, + "learning_rate": 3.0889989131747376e-05, + "loss": 0.5554, + "step": 3165, + "task_loss": 1.8480230569839478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0509960651397705, + "epoch": 2.68, + "learning_rate": 3.0883951213621544e-05, + "loss": 0.7112, + "step": 3166, + "task_loss": 1.5490612983703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0542272329330444, + "epoch": 2.68, + "learning_rate": 3.087791329549571e-05, + "loss": 0.7934, + "step": 3167, + "task_loss": 1.0934957265853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5000607967376709, + "epoch": 2.68, + "learning_rate": 3.0871875377369885e-05, + "loss": 0.7991, + "step": 3168, + "task_loss": 0.08335362374782562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7116163969039917, + "epoch": 2.68, + "learning_rate": 3.086583745924406e-05, + "loss": 0.8771, + "step": 3169, + "task_loss": 0.7051525712013245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.104564905166626, + "epoch": 2.68, + "learning_rate": 3.0859799541118226e-05, + "loss": 0.8047, + "step": 3170, + "task_loss": 1.144094705581665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7311936616897583, + "epoch": 2.68, + "learning_rate": 3.085376162299239e-05, + "loss": 0.7885, + "step": 3171, + "task_loss": 0.7517529129981995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1027672290802002, + "epoch": 2.68, + "learning_rate": 3.084772370486657e-05, + "loss": 0.9567, + "step": 3172, + "task_loss": 1.1185749769210815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4500886797904968, + "epoch": 2.68, + "learning_rate": 3.0841685786740734e-05, + "loss": 0.8539, + "step": 3173, + "task_loss": 0.30752894282341003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.459417462348938, + "epoch": 2.68, + "learning_rate": 3.08356478686149e-05, + "loss": 0.7742, + "step": 3174, + "task_loss": 1.277453064918518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6754051446914673, + "epoch": 2.68, + "learning_rate": 3.0829609950489075e-05, + "loss": 0.6208, + "step": 3175, + "task_loss": 1.224611520767212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34234631061553955, + "epoch": 2.68, + "learning_rate": 3.082357203236324e-05, + "loss": 0.6069, + "step": 3176, + "task_loss": 0.8445130586624146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31016695499420166, + "epoch": 2.69, + "learning_rate": 3.081753411423741e-05, + "loss": 0.7175, + "step": 3177, + "task_loss": 0.760482668876648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8396310806274414, + "epoch": 2.69, + "learning_rate": 3.0811496196111584e-05, + "loss": 0.8082, + "step": 3178, + "task_loss": 1.9849838018417358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7979298830032349, + "epoch": 2.69, + "learning_rate": 3.080545827798575e-05, + "loss": 0.7563, + "step": 3179, + "task_loss": 1.4667794704437256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6752405166625977, + "epoch": 2.69, + "learning_rate": 3.079942035985992e-05, + "loss": 0.6403, + "step": 3180, + "task_loss": 0.5522093176841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6803416013717651, + "epoch": 2.69, + "learning_rate": 3.079338244173409e-05, + "loss": 0.8373, + "step": 3181, + "task_loss": 0.7312726974487305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5461660027503967, + "epoch": 2.69, + "learning_rate": 3.0787344523608266e-05, + "loss": 0.6832, + "step": 3182, + "task_loss": 0.25080814957618713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5495463609695435, + "epoch": 2.69, + "learning_rate": 3.0781306605482426e-05, + "loss": 0.5325, + "step": 3183, + "task_loss": 0.34389790892601013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7502367496490479, + "epoch": 2.69, + "learning_rate": 3.07752686873566e-05, + "loss": 0.7234, + "step": 3184, + "task_loss": 0.412925660610199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9324972033500671, + "epoch": 2.69, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.7835, + "step": 3185, + "task_loss": 0.6428356170654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5306106805801392, + "epoch": 2.69, + "learning_rate": 3.076319285110494e-05, + "loss": 0.5332, + "step": 3186, + "task_loss": 0.39461106061935425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8705208897590637, + "epoch": 2.69, + "learning_rate": 3.075715493297911e-05, + "loss": 0.8455, + "step": 3187, + "task_loss": 0.6035874485969543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9159365296363831, + "epoch": 2.69, + "learning_rate": 3.075111701485328e-05, + "loss": 0.8895, + "step": 3188, + "task_loss": 1.9215211868286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6719887256622314, + "epoch": 2.7, + "learning_rate": 3.074507909672745e-05, + "loss": 0.6852, + "step": 3189, + "task_loss": 0.3079150915145874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9369650483131409, + "epoch": 2.7, + "learning_rate": 3.073904117860162e-05, + "loss": 0.7867, + "step": 3190, + "task_loss": 0.6321751475334167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3319016695022583, + "epoch": 2.7, + "learning_rate": 3.073300326047579e-05, + "loss": 0.5293, + "step": 3191, + "task_loss": 0.38906458020210266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4013788402080536, + "epoch": 2.7, + "learning_rate": 3.072696534234996e-05, + "loss": 0.7443, + "step": 3192, + "task_loss": 0.6293163299560547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6085204482078552, + "epoch": 2.7, + "learning_rate": 3.0720927424224125e-05, + "loss": 0.8517, + "step": 3193, + "task_loss": 1.3240139484405518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5559515953063965, + "epoch": 2.7, + "learning_rate": 3.07148895060983e-05, + "loss": 0.6212, + "step": 3194, + "task_loss": 0.6982637047767639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8499322533607483, + "epoch": 2.7, + "learning_rate": 3.070885158797247e-05, + "loss": 0.6071, + "step": 3195, + "task_loss": 0.9296495318412781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6830109357833862, + "epoch": 2.7, + "learning_rate": 3.0702813669846634e-05, + "loss": 0.6985, + "step": 3196, + "task_loss": 0.625149130821228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6141483187675476, + "epoch": 2.7, + "learning_rate": 3.069677575172081e-05, + "loss": 0.6839, + "step": 3197, + "task_loss": 0.5352165102958679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6265395283699036, + "epoch": 2.7, + "learning_rate": 3.069073783359498e-05, + "loss": 0.6138, + "step": 3198, + "task_loss": 0.4455566108226776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.591829776763916, + "epoch": 2.7, + "learning_rate": 3.068469991546914e-05, + "loss": 0.7552, + "step": 3199, + "task_loss": 0.6086850762367249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8524900078773499, + "epoch": 2.7, + "learning_rate": 3.0678661997343316e-05, + "loss": 0.7744, + "step": 3200, + "task_loss": 1.5730679035186768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6828543543815613, + "epoch": 2.71, + "learning_rate": 3.067262407921749e-05, + "loss": 0.6728, + "step": 3201, + "task_loss": 1.2370249032974243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8591368198394775, + "epoch": 2.71, + "learning_rate": 3.066658616109166e-05, + "loss": 0.7115, + "step": 3202, + "task_loss": 1.4544029235839844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5472726821899414, + "epoch": 2.71, + "learning_rate": 3.0660548242965824e-05, + "loss": 0.6814, + "step": 3203, + "task_loss": 0.6061931848526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.664957582950592, + "epoch": 2.71, + "learning_rate": 3.065451032484e-05, + "loss": 0.5888, + "step": 3204, + "task_loss": 0.6862332820892334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7381258010864258, + "epoch": 2.71, + "learning_rate": 3.0648472406714165e-05, + "loss": 0.6844, + "step": 3205, + "task_loss": 0.5976237654685974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8357290029525757, + "epoch": 2.71, + "learning_rate": 3.064243448858833e-05, + "loss": 0.7789, + "step": 3206, + "task_loss": 1.7306406497955322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1574708223342896, + "epoch": 2.71, + "learning_rate": 3.0636396570462507e-05, + "loss": 0.8458, + "step": 3207, + "task_loss": 0.46113723516464233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42682120203971863, + "epoch": 2.71, + "learning_rate": 3.0630358652336674e-05, + "loss": 0.7908, + "step": 3208, + "task_loss": 0.44245457649230957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8901224136352539, + "epoch": 2.71, + "learning_rate": 3.062432073421084e-05, + "loss": 0.9076, + "step": 3209, + "task_loss": 1.9688218832015991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.490556538105011, + "epoch": 2.71, + "learning_rate": 3.0618282816085015e-05, + "loss": 0.5647, + "step": 3210, + "task_loss": 0.09024570137262344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8634461164474487, + "epoch": 2.71, + "learning_rate": 3.061224489795919e-05, + "loss": 0.7147, + "step": 3211, + "task_loss": 0.4070153534412384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9406939744949341, + "epoch": 2.71, + "learning_rate": 3.0606206979833356e-05, + "loss": 0.8057, + "step": 3212, + "task_loss": 0.8827047348022461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6856909394264221, + "epoch": 2.72, + "learning_rate": 3.060016906170752e-05, + "loss": 0.5144, + "step": 3213, + "task_loss": 0.2870550751686096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5347412824630737, + "epoch": 2.72, + "learning_rate": 3.05941311435817e-05, + "loss": 0.7348, + "step": 3214, + "task_loss": 0.6001242399215698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8324074745178223, + "epoch": 2.72, + "learning_rate": 3.0588093225455864e-05, + "loss": 0.7328, + "step": 3215, + "task_loss": 0.47560784220695496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7889832258224487, + "epoch": 2.72, + "learning_rate": 3.058205530733003e-05, + "loss": 0.8431, + "step": 3216, + "task_loss": 0.6564210057258606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0889413356781006, + "epoch": 2.72, + "learning_rate": 3.0576017389204206e-05, + "loss": 0.7747, + "step": 3217, + "task_loss": 0.5277644991874695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5270012617111206, + "epoch": 2.72, + "learning_rate": 3.056997947107837e-05, + "loss": 0.8013, + "step": 3218, + "task_loss": 0.5058249235153198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6184274554252625, + "epoch": 2.72, + "learning_rate": 3.056394155295254e-05, + "loss": 0.7006, + "step": 3219, + "task_loss": 0.2727234959602356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6865006685256958, + "epoch": 2.72, + "learning_rate": 3.0557903634826714e-05, + "loss": 0.6968, + "step": 3220, + "task_loss": 0.3609784245491028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5488929748535156, + "epoch": 2.72, + "learning_rate": 3.055186571670088e-05, + "loss": 0.6205, + "step": 3221, + "task_loss": 0.632896900177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5753695964813232, + "epoch": 2.72, + "learning_rate": 3.0545827798575055e-05, + "loss": 0.5562, + "step": 3222, + "task_loss": 1.0328339338302612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9624415040016174, + "epoch": 2.72, + "learning_rate": 3.053978988044922e-05, + "loss": 0.8698, + "step": 3223, + "task_loss": 1.5791884660720825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.08939790725708, + "epoch": 2.72, + "learning_rate": 3.053375196232339e-05, + "loss": 1.0014, + "step": 3224, + "task_loss": 1.1648470163345337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0841717720031738, + "epoch": 2.73, + "learning_rate": 3.052771404419756e-05, + "loss": 0.8551, + "step": 3225, + "task_loss": 0.5503373146057129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9547292590141296, + "epoch": 2.73, + "learning_rate": 3.052167612607173e-05, + "loss": 0.9845, + "step": 3226, + "task_loss": 1.687423825263977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.743256688117981, + "epoch": 2.73, + "learning_rate": 3.05156382079459e-05, + "loss": 0.849, + "step": 3227, + "task_loss": 0.6106722354888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6666925549507141, + "epoch": 2.73, + "learning_rate": 3.050960028982007e-05, + "loss": 0.8242, + "step": 3228, + "task_loss": 0.9376764297485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6793197393417358, + "epoch": 2.73, + "learning_rate": 3.050356237169424e-05, + "loss": 0.877, + "step": 3229, + "task_loss": 0.5538238286972046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.923708438873291, + "epoch": 2.73, + "learning_rate": 3.049752445356841e-05, + "loss": 1.0121, + "step": 3230, + "task_loss": 1.5856703519821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8031929135322571, + "epoch": 2.73, + "learning_rate": 3.049148653544258e-05, + "loss": 0.7474, + "step": 3231, + "task_loss": 1.0973801612854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6770730018615723, + "epoch": 2.73, + "learning_rate": 3.0485448617316754e-05, + "loss": 0.646, + "step": 3232, + "task_loss": 0.7020218372344971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3898873031139374, + "epoch": 2.73, + "learning_rate": 3.0479410699190918e-05, + "loss": 0.507, + "step": 3233, + "task_loss": 0.388008713722229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.958076000213623, + "epoch": 2.73, + "learning_rate": 3.047337278106509e-05, + "loss": 0.7033, + "step": 3234, + "task_loss": 0.3326040804386139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5524904727935791, + "epoch": 2.73, + "learning_rate": 3.0467334862939262e-05, + "loss": 0.6663, + "step": 3235, + "task_loss": 0.5325053334236145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5622840523719788, + "epoch": 2.73, + "learning_rate": 3.046129694481343e-05, + "loss": 0.7751, + "step": 3236, + "task_loss": 0.34747156500816345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6312466859817505, + "epoch": 2.74, + "learning_rate": 3.04552590266876e-05, + "loss": 0.6095, + "step": 3237, + "task_loss": 0.14888127148151398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6224997639656067, + "epoch": 2.74, + "learning_rate": 3.044922110856177e-05, + "loss": 0.7414, + "step": 3238, + "task_loss": 0.4874679744243622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29986628890037537, + "epoch": 2.74, + "learning_rate": 3.0443183190435938e-05, + "loss": 0.6452, + "step": 3239, + "task_loss": 0.31125733256340027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4303237795829773, + "epoch": 2.74, + "learning_rate": 3.043714527231011e-05, + "loss": 0.624, + "step": 3240, + "task_loss": 1.8085837364196777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9526863098144531, + "epoch": 2.74, + "learning_rate": 3.043110735418428e-05, + "loss": 0.7177, + "step": 3241, + "task_loss": 0.744300901889801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8251503705978394, + "epoch": 2.74, + "learning_rate": 3.042506943605845e-05, + "loss": 0.667, + "step": 3242, + "task_loss": 0.38434773683547974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4565238356590271, + "epoch": 2.74, + "learning_rate": 3.0419031517932617e-05, + "loss": 0.6573, + "step": 3243, + "task_loss": 0.513244092464447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.086967945098877, + "epoch": 2.74, + "learning_rate": 3.0412993599806787e-05, + "loss": 1.0609, + "step": 3244, + "task_loss": 0.7730523943901062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4086042642593384, + "epoch": 2.74, + "learning_rate": 3.040695568168096e-05, + "loss": 0.8459, + "step": 3245, + "task_loss": 1.009049415588379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.665966808795929, + "epoch": 2.74, + "learning_rate": 3.0400917763555125e-05, + "loss": 0.7695, + "step": 3246, + "task_loss": 1.1233670711517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.559831976890564, + "epoch": 2.74, + "learning_rate": 3.0394879845429296e-05, + "loss": 0.6049, + "step": 3247, + "task_loss": 0.3660418391227722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6235067844390869, + "epoch": 2.75, + "learning_rate": 3.038884192730347e-05, + "loss": 0.8961, + "step": 3248, + "task_loss": 0.5754123330116272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5849120020866394, + "epoch": 2.75, + "learning_rate": 3.0382804009177633e-05, + "loss": 0.707, + "step": 3249, + "task_loss": 0.5135920643806458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6178340315818787, + "epoch": 2.75, + "learning_rate": 3.0376766091051807e-05, + "loss": 0.6944, + "step": 3250, + "task_loss": 0.595054566860199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8201622366905212, + "epoch": 2.75, + "learning_rate": 3.0370728172925978e-05, + "loss": 0.7424, + "step": 3251, + "task_loss": 0.5152836441993713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9982415437698364, + "epoch": 2.75, + "learning_rate": 3.036469025480015e-05, + "loss": 0.7612, + "step": 3252, + "task_loss": 1.0326942205429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2640788555145264, + "epoch": 2.75, + "learning_rate": 3.0358652336674316e-05, + "loss": 0.9939, + "step": 3253, + "task_loss": 1.0172697305679321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0437321662902832, + "epoch": 2.75, + "learning_rate": 3.0352614418548486e-05, + "loss": 0.9208, + "step": 3254, + "task_loss": 1.6995952129364014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4299675226211548, + "epoch": 2.75, + "learning_rate": 3.0346576500422657e-05, + "loss": 0.9805, + "step": 3255, + "task_loss": 0.7143516540527344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5238592624664307, + "epoch": 2.75, + "learning_rate": 3.0340538582296824e-05, + "loss": 0.591, + "step": 3256, + "task_loss": 1.1079025268554688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6339019536972046, + "epoch": 2.75, + "learning_rate": 3.0334500664170994e-05, + "loss": 0.6887, + "step": 3257, + "task_loss": 0.8352885842323303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7218683958053589, + "epoch": 2.75, + "learning_rate": 3.0328462746045165e-05, + "loss": 0.8389, + "step": 3258, + "task_loss": 0.8977619409561157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8854718208312988, + "epoch": 2.75, + "learning_rate": 3.0322424827919332e-05, + "loss": 0.8207, + "step": 3259, + "task_loss": 1.7896380424499512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0721087455749512, + "epoch": 2.76, + "learning_rate": 3.0316386909793503e-05, + "loss": 0.8341, + "step": 3260, + "task_loss": 0.8420252799987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1111098527908325, + "epoch": 2.76, + "learning_rate": 3.0310348991667677e-05, + "loss": 0.8382, + "step": 3261, + "task_loss": 0.5889455676078796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8799079656600952, + "epoch": 2.76, + "learning_rate": 3.0304311073541847e-05, + "loss": 0.6934, + "step": 3262, + "task_loss": 1.2015488147735596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3919408917427063, + "epoch": 2.76, + "learning_rate": 3.029827315541601e-05, + "loss": 0.6058, + "step": 3263, + "task_loss": 0.7438454627990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9071213006973267, + "epoch": 2.76, + "learning_rate": 3.0292235237290185e-05, + "loss": 0.7473, + "step": 3264, + "task_loss": 0.7080237865447998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3722788095474243, + "epoch": 2.76, + "learning_rate": 3.0286197319164356e-05, + "loss": 0.7755, + "step": 3265, + "task_loss": 1.4974243640899658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.583774983882904, + "epoch": 2.76, + "learning_rate": 3.0280159401038523e-05, + "loss": 0.7605, + "step": 3266, + "task_loss": 0.15253575146198273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6703147888183594, + "epoch": 2.76, + "learning_rate": 3.0274121482912693e-05, + "loss": 0.756, + "step": 3267, + "task_loss": 0.7222764492034912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4770735502243042, + "epoch": 2.76, + "learning_rate": 3.0268083564786864e-05, + "loss": 0.7505, + "step": 3268, + "task_loss": 0.9355275630950928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2244980335235596, + "epoch": 2.76, + "learning_rate": 3.026204564666103e-05, + "loss": 0.9068, + "step": 3269, + "task_loss": 1.3997679948806763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7069613933563232, + "epoch": 2.76, + "learning_rate": 3.0256007728535202e-05, + "loss": 0.7463, + "step": 3270, + "task_loss": 0.6206645369529724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5903657078742981, + "epoch": 2.76, + "learning_rate": 3.0249969810409372e-05, + "loss": 0.9634, + "step": 3271, + "task_loss": 0.3822365403175354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6834724545478821, + "epoch": 2.77, + "learning_rate": 3.0243931892283546e-05, + "loss": 0.825, + "step": 3272, + "task_loss": 0.8759979009628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5028154253959656, + "epoch": 2.77, + "learning_rate": 3.023789397415771e-05, + "loss": 0.5826, + "step": 3273, + "task_loss": 1.3599047660827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8747584223747253, + "epoch": 2.77, + "learning_rate": 3.023185605603188e-05, + "loss": 0.5541, + "step": 3274, + "task_loss": 0.46913453936576843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6725547909736633, + "epoch": 2.77, + "learning_rate": 3.0225818137906055e-05, + "loss": 0.5366, + "step": 3275, + "task_loss": 0.2768903374671936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8722879886627197, + "epoch": 2.77, + "learning_rate": 3.021978021978022e-05, + "loss": 0.654, + "step": 3276, + "task_loss": 0.9155100584030151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8894006013870239, + "epoch": 2.77, + "learning_rate": 3.0213742301654392e-05, + "loss": 0.8126, + "step": 3277, + "task_loss": 0.2742604613304138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5366387367248535, + "epoch": 2.77, + "learning_rate": 3.0207704383528563e-05, + "loss": 0.6444, + "step": 3278, + "task_loss": 0.7328145503997803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.846782922744751, + "epoch": 2.77, + "learning_rate": 3.0201666465402727e-05, + "loss": 0.6579, + "step": 3279, + "task_loss": 1.3426260948181152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8142915964126587, + "epoch": 2.77, + "learning_rate": 3.01956285472769e-05, + "loss": 0.9306, + "step": 3280, + "task_loss": 0.413924902677536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5528120398521423, + "epoch": 2.77, + "learning_rate": 3.018959062915107e-05, + "loss": 0.776, + "step": 3281, + "task_loss": 1.9705495834350586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9322274923324585, + "epoch": 2.77, + "learning_rate": 3.0183552711025242e-05, + "loss": 0.9211, + "step": 3282, + "task_loss": 2.0287837982177734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3485484719276428, + "epoch": 2.77, + "learning_rate": 3.017751479289941e-05, + "loss": 0.7222, + "step": 3283, + "task_loss": 0.14161722362041473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.81611168384552, + "epoch": 2.78, + "learning_rate": 3.017147687477358e-05, + "loss": 0.7918, + "step": 3284, + "task_loss": 1.099387288093567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5183975696563721, + "epoch": 2.78, + "learning_rate": 3.016543895664775e-05, + "loss": 0.9085, + "step": 3285, + "task_loss": 0.8479616045951843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.965560793876648, + "epoch": 2.78, + "learning_rate": 3.0159401038521917e-05, + "loss": 1.0197, + "step": 3286, + "task_loss": 0.7708216905593872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7323737144470215, + "epoch": 2.78, + "learning_rate": 3.0153363120396088e-05, + "loss": 0.7039, + "step": 3287, + "task_loss": 0.989920973777771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4878042936325073, + "epoch": 2.78, + "learning_rate": 3.014732520227026e-05, + "loss": 0.9399, + "step": 3288, + "task_loss": 1.4251518249511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42194879055023193, + "epoch": 2.78, + "learning_rate": 3.0141287284144426e-05, + "loss": 0.6529, + "step": 3289, + "task_loss": 0.21817432343959808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7053632736206055, + "epoch": 2.78, + "learning_rate": 3.0135249366018596e-05, + "loss": 0.8551, + "step": 3290, + "task_loss": 0.44933223724365234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5540753602981567, + "epoch": 2.78, + "learning_rate": 3.012921144789277e-05, + "loss": 0.8164, + "step": 3291, + "task_loss": 0.3803727328777313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8512284159660339, + "epoch": 2.78, + "learning_rate": 3.012317352976694e-05, + "loss": 0.6673, + "step": 3292, + "task_loss": 0.6789261102676392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7099487781524658, + "epoch": 2.78, + "learning_rate": 3.0117135611641108e-05, + "loss": 0.8345, + "step": 3293, + "task_loss": 0.5557613372802734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5668407082557678, + "epoch": 2.78, + "learning_rate": 3.011109769351528e-05, + "loss": 0.6318, + "step": 3294, + "task_loss": 0.5616931915283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5764466524124146, + "epoch": 2.78, + "learning_rate": 3.010505977538945e-05, + "loss": 0.8859, + "step": 3295, + "task_loss": 0.9217730760574341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9525061845779419, + "epoch": 2.79, + "learning_rate": 3.0099021857263616e-05, + "loss": 0.7496, + "step": 3296, + "task_loss": 1.0445730686187744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3469659090042114, + "epoch": 2.79, + "learning_rate": 3.0092983939137787e-05, + "loss": 0.7823, + "step": 3297, + "task_loss": 1.539186954498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7258437871932983, + "epoch": 2.79, + "learning_rate": 3.0086946021011957e-05, + "loss": 0.721, + "step": 3298, + "task_loss": 0.718285322189331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4952031672000885, + "epoch": 2.79, + "learning_rate": 3.0080908102886125e-05, + "loss": 0.7038, + "step": 3299, + "task_loss": 0.27497726678848267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6820345520973206, + "epoch": 2.79, + "learning_rate": 3.0074870184760295e-05, + "loss": 0.8643, + "step": 3300, + "task_loss": 1.3861004114151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6025184392929077, + "epoch": 2.79, + "learning_rate": 3.0068832266634466e-05, + "loss": 0.6824, + "step": 3301, + "task_loss": 0.6789702773094177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9073735475540161, + "epoch": 2.79, + "learning_rate": 3.006279434850864e-05, + "loss": 0.8305, + "step": 3302, + "task_loss": 0.5434929132461548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7894983291625977, + "epoch": 2.79, + "learning_rate": 3.0056756430382803e-05, + "loss": 1.1227, + "step": 3303, + "task_loss": 0.8957256078720093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8467644453048706, + "epoch": 2.79, + "learning_rate": 3.0050718512256974e-05, + "loss": 0.8402, + "step": 3304, + "task_loss": 0.8982765078544617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5099388957023621, + "epoch": 2.79, + "learning_rate": 3.0044680594131148e-05, + "loss": 0.6793, + "step": 3305, + "task_loss": 0.8377856016159058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5668447613716125, + "epoch": 2.79, + "learning_rate": 3.0038642676005312e-05, + "loss": 0.6099, + "step": 3306, + "task_loss": 0.7080490589141846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7160319685935974, + "epoch": 2.79, + "learning_rate": 3.0032604757879486e-05, + "loss": 0.6283, + "step": 3307, + "task_loss": 0.8550854921340942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.603061854839325, + "epoch": 2.8, + "learning_rate": 3.0026566839753656e-05, + "loss": 0.8496, + "step": 3308, + "task_loss": 1.1990206241607666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8270069360733032, + "epoch": 2.8, + "learning_rate": 3.002052892162782e-05, + "loss": 0.9871, + "step": 3309, + "task_loss": 1.043546438217163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.093401312828064, + "epoch": 2.8, + "learning_rate": 3.0014491003501994e-05, + "loss": 0.826, + "step": 3310, + "task_loss": 1.8238162994384766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8919836282730103, + "epoch": 2.8, + "learning_rate": 3.0008453085376165e-05, + "loss": 0.7791, + "step": 3311, + "task_loss": 1.5206384658813477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8345562219619751, + "epoch": 2.8, + "learning_rate": 3.0002415167250335e-05, + "loss": 0.8903, + "step": 3312, + "task_loss": 0.46383753418922424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43804049491882324, + "epoch": 2.8, + "learning_rate": 2.9996377249124502e-05, + "loss": 0.5437, + "step": 3313, + "task_loss": 0.21464987099170685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1369664669036865, + "epoch": 2.8, + "learning_rate": 2.9990339330998673e-05, + "loss": 0.8468, + "step": 3314, + "task_loss": 1.3994513750076294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4454078674316406, + "epoch": 2.8, + "learning_rate": 2.9984301412872844e-05, + "loss": 0.5578, + "step": 3315, + "task_loss": 0.10286688804626465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.838683009147644, + "epoch": 2.8, + "learning_rate": 2.997826349474701e-05, + "loss": 0.7295, + "step": 3316, + "task_loss": 0.7089868783950806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.707889974117279, + "epoch": 2.8, + "learning_rate": 2.997222557662118e-05, + "loss": 0.7253, + "step": 3317, + "task_loss": 0.9120591282844543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43396711349487305, + "epoch": 2.8, + "learning_rate": 2.9966187658495355e-05, + "loss": 0.8796, + "step": 3318, + "task_loss": 0.4949934482574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4237309694290161, + "epoch": 2.81, + "learning_rate": 2.996014974036952e-05, + "loss": 0.6691, + "step": 3319, + "task_loss": 0.7241352200508118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9272547960281372, + "epoch": 2.81, + "learning_rate": 2.995411182224369e-05, + "loss": 0.7312, + "step": 3320, + "task_loss": 0.7715173959732056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5984746217727661, + "epoch": 2.81, + "learning_rate": 2.9948073904117864e-05, + "loss": 0.6947, + "step": 3321, + "task_loss": 0.8189025521278381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3606223165988922, + "epoch": 2.81, + "learning_rate": 2.9942035985992034e-05, + "loss": 0.7051, + "step": 3322, + "task_loss": 0.08540637791156769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6124386787414551, + "epoch": 2.81, + "learning_rate": 2.99359980678662e-05, + "loss": 0.5368, + "step": 3323, + "task_loss": 0.17735710740089417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6173439025878906, + "epoch": 2.81, + "learning_rate": 2.9929960149740372e-05, + "loss": 0.7134, + "step": 3324, + "task_loss": 0.7662703394889832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.646454930305481, + "epoch": 2.81, + "learning_rate": 2.9923922231614543e-05, + "loss": 0.6548, + "step": 3325, + "task_loss": 0.3659009337425232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7730146646499634, + "epoch": 2.81, + "learning_rate": 2.991788431348871e-05, + "loss": 0.7001, + "step": 3326, + "task_loss": 0.5309557318687439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17139969766139984, + "epoch": 2.81, + "learning_rate": 2.991184639536288e-05, + "loss": 0.5165, + "step": 3327, + "task_loss": 0.4610321521759033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.901475191116333, + "epoch": 2.81, + "learning_rate": 2.990580847723705e-05, + "loss": 0.6257, + "step": 3328, + "task_loss": 0.9280927777290344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3980725407600403, + "epoch": 2.81, + "learning_rate": 2.9899770559111218e-05, + "loss": 0.5709, + "step": 3329, + "task_loss": 1.1071299314498901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.418002724647522, + "epoch": 2.81, + "learning_rate": 2.989373264098539e-05, + "loss": 0.8482, + "step": 3330, + "task_loss": 0.20210659503936768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6358230113983154, + "epoch": 2.82, + "learning_rate": 2.988769472285956e-05, + "loss": 0.9167, + "step": 3331, + "task_loss": 0.745890200138092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9441062211990356, + "epoch": 2.82, + "learning_rate": 2.9881656804733733e-05, + "loss": 0.8165, + "step": 3332, + "task_loss": 0.959979772567749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9455717206001282, + "epoch": 2.82, + "learning_rate": 2.9875618886607897e-05, + "loss": 0.8012, + "step": 3333, + "task_loss": 1.034719467163086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8987544775009155, + "epoch": 2.82, + "learning_rate": 2.986958096848207e-05, + "loss": 0.6793, + "step": 3334, + "task_loss": 0.8512789011001587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5758401155471802, + "epoch": 2.82, + "learning_rate": 2.986354305035624e-05, + "loss": 0.6613, + "step": 3335, + "task_loss": 0.9593351483345032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28677982091903687, + "epoch": 2.82, + "learning_rate": 2.9857505132230405e-05, + "loss": 0.5497, + "step": 3336, + "task_loss": 0.06280370056629181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6193314790725708, + "epoch": 2.82, + "learning_rate": 2.985146721410458e-05, + "loss": 0.8424, + "step": 3337, + "task_loss": 0.4445653557777405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6852444410324097, + "epoch": 2.82, + "learning_rate": 2.984542929597875e-05, + "loss": 1.0175, + "step": 3338, + "task_loss": 0.5066931843757629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5662254691123962, + "epoch": 2.82, + "learning_rate": 2.9839391377852917e-05, + "loss": 0.721, + "step": 3339, + "task_loss": 0.40833961963653564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5589171648025513, + "epoch": 2.82, + "learning_rate": 2.9833353459727088e-05, + "loss": 0.5924, + "step": 3340, + "task_loss": 0.5408655405044556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5563128590583801, + "epoch": 2.82, + "learning_rate": 2.9827315541601258e-05, + "loss": 0.7494, + "step": 3341, + "task_loss": 0.5156218409538269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4187811613082886, + "epoch": 2.82, + "learning_rate": 2.982127762347543e-05, + "loss": 0.8726, + "step": 3342, + "task_loss": 1.3007992506027222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.6975007057189941, + "epoch": 2.83, + "learning_rate": 2.9815239705349596e-05, + "loss": 1.1042, + "step": 3343, + "task_loss": 1.2388087511062622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32393598556518555, + "epoch": 2.83, + "learning_rate": 2.9809201787223766e-05, + "loss": 0.447, + "step": 3344, + "task_loss": 0.4461499750614166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5517587661743164, + "epoch": 2.83, + "learning_rate": 2.9803163869097937e-05, + "loss": 0.709, + "step": 3345, + "task_loss": 0.42245104908943176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8855922222137451, + "epoch": 2.83, + "learning_rate": 2.9797125950972104e-05, + "loss": 0.8355, + "step": 3346, + "task_loss": 0.5716240406036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47937145829200745, + "epoch": 2.83, + "learning_rate": 2.9791088032846275e-05, + "loss": 0.6628, + "step": 3347, + "task_loss": 0.4947882294654846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.928808331489563, + "epoch": 2.83, + "learning_rate": 2.978505011472045e-05, + "loss": 0.6398, + "step": 3348, + "task_loss": 1.5807663202285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47245272994041443, + "epoch": 2.83, + "learning_rate": 2.9779012196594612e-05, + "loss": 0.791, + "step": 3349, + "task_loss": 1.283268928527832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6965512037277222, + "epoch": 2.83, + "learning_rate": 2.9772974278468786e-05, + "loss": 0.5812, + "step": 3350, + "task_loss": 1.1770071983337402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6762098073959351, + "epoch": 2.83, + "learning_rate": 2.9766936360342957e-05, + "loss": 0.8083, + "step": 3351, + "task_loss": 0.9953381419181824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7784196138381958, + "epoch": 2.83, + "learning_rate": 2.976089844221712e-05, + "loss": 0.8165, + "step": 3352, + "task_loss": 1.2067924737930298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9041169881820679, + "epoch": 2.83, + "learning_rate": 2.9754860524091295e-05, + "loss": 0.7064, + "step": 3353, + "task_loss": 0.554377555847168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.675803542137146, + "epoch": 2.83, + "learning_rate": 2.9748822605965465e-05, + "loss": 0.7949, + "step": 3354, + "task_loss": 1.5139515399932861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2474185228347778, + "epoch": 2.84, + "learning_rate": 2.9742784687839636e-05, + "loss": 1.051, + "step": 3355, + "task_loss": 1.3728817701339722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7146264910697937, + "epoch": 2.84, + "learning_rate": 2.9736746769713803e-05, + "loss": 0.7898, + "step": 3356, + "task_loss": 0.6647437214851379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7725421786308289, + "epoch": 2.84, + "learning_rate": 2.9730708851587974e-05, + "loss": 0.7098, + "step": 3357, + "task_loss": 0.6905233860015869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6829078197479248, + "epoch": 2.84, + "learning_rate": 2.9724670933462144e-05, + "loss": 0.5362, + "step": 3358, + "task_loss": 0.6515323519706726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5686049461364746, + "epoch": 2.84, + "learning_rate": 2.971863301533631e-05, + "loss": 0.8427, + "step": 3359, + "task_loss": 1.1448637247085571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7078079581260681, + "epoch": 2.84, + "learning_rate": 2.9712595097210482e-05, + "loss": 0.7538, + "step": 3360, + "task_loss": 0.8823916912078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5136501789093018, + "epoch": 2.84, + "learning_rate": 2.9706557179084653e-05, + "loss": 0.7323, + "step": 3361, + "task_loss": 0.2550985813140869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5257319211959839, + "epoch": 2.84, + "learning_rate": 2.970051926095882e-05, + "loss": 0.6241, + "step": 3362, + "task_loss": 0.5669220685958862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8820339441299438, + "epoch": 2.84, + "learning_rate": 2.969448134283299e-05, + "loss": 0.862, + "step": 3363, + "task_loss": 0.6643784046173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7887123823165894, + "epoch": 2.84, + "learning_rate": 2.9688443424707164e-05, + "loss": 0.8536, + "step": 3364, + "task_loss": 1.103582501411438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4248363673686981, + "epoch": 2.84, + "learning_rate": 2.9682405506581335e-05, + "loss": 0.6901, + "step": 3365, + "task_loss": 1.1229437589645386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8013144731521606, + "epoch": 2.84, + "learning_rate": 2.9676367588455502e-05, + "loss": 0.5921, + "step": 3366, + "task_loss": 1.3697738647460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5745146870613098, + "epoch": 2.85, + "learning_rate": 2.9670329670329673e-05, + "loss": 0.8415, + "step": 3367, + "task_loss": 0.9808184504508972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5577234029769897, + "epoch": 2.85, + "learning_rate": 2.9664291752203843e-05, + "loss": 0.7684, + "step": 3368, + "task_loss": 0.9671440124511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4190555214881897, + "epoch": 2.85, + "learning_rate": 2.965825383407801e-05, + "loss": 0.699, + "step": 3369, + "task_loss": 0.09167198836803436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8989880084991455, + "epoch": 2.85, + "learning_rate": 2.965221591595218e-05, + "loss": 0.8081, + "step": 3370, + "task_loss": 1.1267329454421997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7191668152809143, + "epoch": 2.85, + "learning_rate": 2.964617799782635e-05, + "loss": 0.712, + "step": 3371, + "task_loss": 0.8082057237625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0991194248199463, + "epoch": 2.85, + "learning_rate": 2.964014007970052e-05, + "loss": 0.9959, + "step": 3372, + "task_loss": 0.6798328757286072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5841532349586487, + "epoch": 2.85, + "learning_rate": 2.963410216157469e-05, + "loss": 0.4368, + "step": 3373, + "task_loss": 0.3852476477622986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4525078237056732, + "epoch": 2.85, + "learning_rate": 2.962806424344886e-05, + "loss": 0.738, + "step": 3374, + "task_loss": 0.6469378471374512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0774712562561035, + "epoch": 2.85, + "learning_rate": 2.9622026325323034e-05, + "loss": 0.695, + "step": 3375, + "task_loss": 0.8054196834564209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8607787489891052, + "epoch": 2.85, + "learning_rate": 2.9615988407197198e-05, + "loss": 0.7308, + "step": 3376, + "task_loss": 1.0702584981918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9457405805587769, + "epoch": 2.85, + "learning_rate": 2.9609950489071368e-05, + "loss": 0.7783, + "step": 3377, + "task_loss": 0.7950518131256104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5823343396186829, + "epoch": 2.85, + "learning_rate": 2.9603912570945542e-05, + "loss": 0.5022, + "step": 3378, + "task_loss": 0.6022517085075378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6101597547531128, + "epoch": 2.86, + "learning_rate": 2.9597874652819706e-05, + "loss": 0.9097, + "step": 3379, + "task_loss": 1.0370609760284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.346673846244812, + "epoch": 2.86, + "learning_rate": 2.959183673469388e-05, + "loss": 0.7723, + "step": 3380, + "task_loss": 0.6194068789482117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.477618545293808, + "epoch": 2.86, + "learning_rate": 2.958579881656805e-05, + "loss": 0.6663, + "step": 3381, + "task_loss": 0.24159900844097137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9971652626991272, + "epoch": 2.86, + "learning_rate": 2.9579760898442214e-05, + "loss": 0.8827, + "step": 3382, + "task_loss": 1.0553619861602783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3792429268360138, + "epoch": 2.86, + "learning_rate": 2.9573722980316388e-05, + "loss": 0.7844, + "step": 3383, + "task_loss": 0.3806905150413513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.310664415359497, + "epoch": 2.86, + "learning_rate": 2.956768506219056e-05, + "loss": 0.993, + "step": 3384, + "task_loss": 0.9180289506912231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3871214985847473, + "epoch": 2.86, + "learning_rate": 2.956164714406473e-05, + "loss": 0.8203, + "step": 3385, + "task_loss": 0.7736110687255859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8282737135887146, + "epoch": 2.86, + "learning_rate": 2.9555609225938897e-05, + "loss": 0.9284, + "step": 3386, + "task_loss": 0.7874165177345276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1021589040756226, + "epoch": 2.86, + "learning_rate": 2.9549571307813067e-05, + "loss": 0.7402, + "step": 3387, + "task_loss": 1.7246118783950806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6346914172172546, + "epoch": 2.86, + "learning_rate": 2.9543533389687238e-05, + "loss": 1.0035, + "step": 3388, + "task_loss": 0.7719483971595764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8377894759178162, + "epoch": 2.86, + "learning_rate": 2.9537495471561405e-05, + "loss": 0.5944, + "step": 3389, + "task_loss": 0.6319814324378967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4818529784679413, + "epoch": 2.87, + "learning_rate": 2.9531457553435575e-05, + "loss": 0.6585, + "step": 3390, + "task_loss": 0.8902625441551208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5993454456329346, + "epoch": 2.87, + "learning_rate": 2.952541963530975e-05, + "loss": 0.5803, + "step": 3391, + "task_loss": 0.7087879180908203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7439875602722168, + "epoch": 2.87, + "learning_rate": 2.9519381717183913e-05, + "loss": 0.8167, + "step": 3392, + "task_loss": 0.8190661668777466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8822708129882812, + "epoch": 2.87, + "learning_rate": 2.9513343799058084e-05, + "loss": 0.8505, + "step": 3393, + "task_loss": 0.9617863893508911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8747755885124207, + "epoch": 2.87, + "learning_rate": 2.9507305880932258e-05, + "loss": 0.6069, + "step": 3394, + "task_loss": 0.4685360789299011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4997193217277527, + "epoch": 2.87, + "learning_rate": 2.9501267962806428e-05, + "loss": 0.788, + "step": 3395, + "task_loss": 0.5313251614570618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5267899036407471, + "epoch": 2.87, + "learning_rate": 2.9495230044680595e-05, + "loss": 0.6196, + "step": 3396, + "task_loss": 0.3576836884021759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6000368595123291, + "epoch": 2.87, + "learning_rate": 2.9489192126554766e-05, + "loss": 0.6254, + "step": 3397, + "task_loss": 0.25771230459213257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4994317889213562, + "epoch": 2.87, + "learning_rate": 2.9483154208428937e-05, + "loss": 0.6603, + "step": 3398, + "task_loss": 0.467429518699646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5789288282394409, + "epoch": 2.87, + "learning_rate": 2.9477116290303104e-05, + "loss": 0.8261, + "step": 3399, + "task_loss": 0.9145685434341431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6369884610176086, + "epoch": 2.87, + "learning_rate": 2.9471078372177274e-05, + "loss": 0.716, + "step": 3400, + "task_loss": 0.8176102638244629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5472508668899536, + "epoch": 2.87, + "learning_rate": 2.9465040454051445e-05, + "loss": 0.5094, + "step": 3401, + "task_loss": 0.6450841426849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8643209338188171, + "epoch": 2.88, + "learning_rate": 2.9459002535925612e-05, + "loss": 0.7853, + "step": 3402, + "task_loss": 0.9234936237335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6933404803276062, + "epoch": 2.88, + "learning_rate": 2.9452964617799783e-05, + "loss": 0.7104, + "step": 3403, + "task_loss": 0.9083490967750549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7489376068115234, + "epoch": 2.88, + "learning_rate": 2.9446926699673953e-05, + "loss": 0.8399, + "step": 3404, + "task_loss": 1.4088902473449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6857874393463135, + "epoch": 2.88, + "learning_rate": 2.9440888781548127e-05, + "loss": 0.8249, + "step": 3405, + "task_loss": 0.3899174928665161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5934022665023804, + "epoch": 2.88, + "learning_rate": 2.943485086342229e-05, + "loss": 0.5459, + "step": 3406, + "task_loss": 0.6158190965652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37101471424102783, + "epoch": 2.88, + "learning_rate": 2.9428812945296465e-05, + "loss": 0.7211, + "step": 3407, + "task_loss": 0.8972424268722534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7002087831497192, + "epoch": 2.88, + "learning_rate": 2.9422775027170636e-05, + "loss": 0.7783, + "step": 3408, + "task_loss": 0.6444404125213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4548088312149048, + "epoch": 2.88, + "learning_rate": 2.94167371090448e-05, + "loss": 0.5573, + "step": 3409, + "task_loss": 0.43447262048721313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.437225341796875, + "epoch": 2.88, + "learning_rate": 2.9410699190918973e-05, + "loss": 0.5836, + "step": 3410, + "task_loss": 0.48462316393852234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5605751276016235, + "epoch": 2.88, + "learning_rate": 2.9404661272793144e-05, + "loss": 0.6641, + "step": 3411, + "task_loss": 1.3483930826187134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8081914186477661, + "epoch": 2.88, + "learning_rate": 2.939862335466731e-05, + "loss": 0.7875, + "step": 3412, + "task_loss": 0.9629995822906494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6137567758560181, + "epoch": 2.88, + "learning_rate": 2.939258543654148e-05, + "loss": 0.6513, + "step": 3413, + "task_loss": 0.9355418086051941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.480022132396698, + "epoch": 2.89, + "learning_rate": 2.9386547518415652e-05, + "loss": 0.9328, + "step": 3414, + "task_loss": 1.194594144821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6108629703521729, + "epoch": 2.89, + "learning_rate": 2.9380509600289823e-05, + "loss": 0.8745, + "step": 3415, + "task_loss": 1.4879045486450195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45273736119270325, + "epoch": 2.89, + "learning_rate": 2.937447168216399e-05, + "loss": 0.8364, + "step": 3416, + "task_loss": 0.8298737406730652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7358760237693787, + "epoch": 2.89, + "learning_rate": 2.936843376403816e-05, + "loss": 0.8043, + "step": 3417, + "task_loss": 1.0173017978668213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1834475994110107, + "epoch": 2.89, + "learning_rate": 2.936239584591233e-05, + "loss": 0.9238, + "step": 3418, + "task_loss": 0.9043486714363098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5684958696365356, + "epoch": 2.89, + "learning_rate": 2.9356357927786498e-05, + "loss": 0.5193, + "step": 3419, + "task_loss": 0.7145360708236694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.123645544052124, + "epoch": 2.89, + "learning_rate": 2.935032000966067e-05, + "loss": 0.8302, + "step": 3420, + "task_loss": 1.851700782775879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48065564036369324, + "epoch": 2.89, + "learning_rate": 2.9344282091534843e-05, + "loss": 0.7259, + "step": 3421, + "task_loss": 1.1903222799301147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9967334866523743, + "epoch": 2.89, + "learning_rate": 2.9338244173409007e-05, + "loss": 0.7928, + "step": 3422, + "task_loss": 1.459773302078247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6306959390640259, + "epoch": 2.89, + "learning_rate": 2.933220625528318e-05, + "loss": 0.6586, + "step": 3423, + "task_loss": 0.5843677520751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.805478572845459, + "epoch": 2.89, + "learning_rate": 2.932616833715735e-05, + "loss": 0.8505, + "step": 3424, + "task_loss": 0.29496854543685913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6788901090621948, + "epoch": 2.89, + "learning_rate": 2.932013041903152e-05, + "loss": 0.6658, + "step": 3425, + "task_loss": 1.5300517082214355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6280191540718079, + "epoch": 2.9, + "learning_rate": 2.931409250090569e-05, + "loss": 0.6482, + "step": 3426, + "task_loss": 1.5265891551971436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.674058198928833, + "epoch": 2.9, + "learning_rate": 2.930805458277986e-05, + "loss": 0.8421, + "step": 3427, + "task_loss": 0.6224533915519714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7258622646331787, + "epoch": 2.9, + "learning_rate": 2.930201666465403e-05, + "loss": 0.7312, + "step": 3428, + "task_loss": 0.6626542210578918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7254615426063538, + "epoch": 2.9, + "learning_rate": 2.9295978746528197e-05, + "loss": 0.5826, + "step": 3429, + "task_loss": 0.6370080709457397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1862752437591553, + "epoch": 2.9, + "learning_rate": 2.9289940828402368e-05, + "loss": 0.8631, + "step": 3430, + "task_loss": 1.3147554397583008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46955108642578125, + "epoch": 2.9, + "learning_rate": 2.928390291027654e-05, + "loss": 0.4554, + "step": 3431, + "task_loss": 0.48791080713272095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9962940216064453, + "epoch": 2.9, + "learning_rate": 2.9277864992150706e-05, + "loss": 0.5759, + "step": 3432, + "task_loss": 1.6304450035095215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5472315549850464, + "epoch": 2.9, + "learning_rate": 2.9271827074024876e-05, + "loss": 0.8826, + "step": 3433, + "task_loss": 0.4080277383327484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7875730991363525, + "epoch": 2.9, + "learning_rate": 2.9265789155899047e-05, + "loss": 0.6279, + "step": 3434, + "task_loss": 0.977450966835022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7464276552200317, + "epoch": 2.9, + "learning_rate": 2.925975123777322e-05, + "loss": 0.8775, + "step": 3435, + "task_loss": 1.0467785596847534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8669809103012085, + "epoch": 2.9, + "learning_rate": 2.9253713319647384e-05, + "loss": 0.6656, + "step": 3436, + "task_loss": 1.655787467956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5054278373718262, + "epoch": 2.9, + "learning_rate": 2.924767540152156e-05, + "loss": 0.5236, + "step": 3437, + "task_loss": 0.638662576675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8156613111495972, + "epoch": 2.91, + "learning_rate": 2.924163748339573e-05, + "loss": 0.6833, + "step": 3438, + "task_loss": 0.41401582956314087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.825654923915863, + "epoch": 2.91, + "learning_rate": 2.9235599565269893e-05, + "loss": 0.6207, + "step": 3439, + "task_loss": 1.8947583436965942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3961506485939026, + "epoch": 2.91, + "learning_rate": 2.9229561647144067e-05, + "loss": 0.5925, + "step": 3440, + "task_loss": 0.7312431335449219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7383763790130615, + "epoch": 2.91, + "learning_rate": 2.9223523729018237e-05, + "loss": 1.0785, + "step": 3441, + "task_loss": 1.5294227600097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5205205678939819, + "epoch": 2.91, + "learning_rate": 2.9217485810892404e-05, + "loss": 0.7443, + "step": 3442, + "task_loss": 0.22918947041034698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7278944253921509, + "epoch": 2.91, + "learning_rate": 2.9211447892766575e-05, + "loss": 0.964, + "step": 3443, + "task_loss": 0.4356386661529541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9423568844795227, + "epoch": 2.91, + "learning_rate": 2.9205409974640746e-05, + "loss": 0.676, + "step": 3444, + "task_loss": 0.5034884214401245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5563369989395142, + "epoch": 2.91, + "learning_rate": 2.9199372056514916e-05, + "loss": 0.8046, + "step": 3445, + "task_loss": 1.4774986505508423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.576687216758728, + "epoch": 2.91, + "learning_rate": 2.9193334138389083e-05, + "loss": 0.6555, + "step": 3446, + "task_loss": 0.343268483877182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0148389339447021, + "epoch": 2.91, + "learning_rate": 2.9187296220263254e-05, + "loss": 0.6587, + "step": 3447, + "task_loss": 1.250695824623108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6466243267059326, + "epoch": 2.91, + "learning_rate": 2.9181258302137428e-05, + "loss": 0.7111, + "step": 3448, + "task_loss": 0.7125418782234192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1003168821334839, + "epoch": 2.91, + "learning_rate": 2.917522038401159e-05, + "loss": 0.8602, + "step": 3449, + "task_loss": 1.1838304996490479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.172728180885315, + "epoch": 2.92, + "learning_rate": 2.9169182465885762e-05, + "loss": 0.7834, + "step": 3450, + "task_loss": 0.709757387638092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7840103507041931, + "epoch": 2.92, + "learning_rate": 2.9163144547759936e-05, + "loss": 0.7143, + "step": 3451, + "task_loss": 1.2206426858901978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7525635957717896, + "epoch": 2.92, + "learning_rate": 2.91571066296341e-05, + "loss": 0.7025, + "step": 3452, + "task_loss": 0.6732720732688904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38421669602394104, + "epoch": 2.92, + "learning_rate": 2.9151068711508274e-05, + "loss": 0.6318, + "step": 3453, + "task_loss": 0.5469582080841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8543015718460083, + "epoch": 2.92, + "learning_rate": 2.9145030793382445e-05, + "loss": 0.8314, + "step": 3454, + "task_loss": 0.6467356085777283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7180390357971191, + "epoch": 2.92, + "learning_rate": 2.9138992875256615e-05, + "loss": 0.7665, + "step": 3455, + "task_loss": 2.445740222930908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6109917163848877, + "epoch": 2.92, + "learning_rate": 2.9132954957130782e-05, + "loss": 0.7538, + "step": 3456, + "task_loss": 0.9847996234893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7462831735610962, + "epoch": 2.92, + "learning_rate": 2.9126917039004953e-05, + "loss": 0.7839, + "step": 3457, + "task_loss": 0.698974072933197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8333162069320679, + "epoch": 2.92, + "learning_rate": 2.9120879120879123e-05, + "loss": 0.7337, + "step": 3458, + "task_loss": 1.292702078819275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7255497574806213, + "epoch": 2.92, + "learning_rate": 2.911484120275329e-05, + "loss": 0.6461, + "step": 3459, + "task_loss": 0.5546559691429138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8517965078353882, + "epoch": 2.92, + "learning_rate": 2.910880328462746e-05, + "loss": 0.6971, + "step": 3460, + "task_loss": 0.7008397579193115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4233158230781555, + "epoch": 2.93, + "learning_rate": 2.9102765366501632e-05, + "loss": 0.47, + "step": 3461, + "task_loss": 0.22391372919082642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47498345375061035, + "epoch": 2.93, + "learning_rate": 2.90967274483758e-05, + "loss": 0.5993, + "step": 3462, + "task_loss": 1.7497204542160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6662006974220276, + "epoch": 2.93, + "learning_rate": 2.909068953024997e-05, + "loss": 0.7446, + "step": 3463, + "task_loss": 1.8353526592254639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9654672145843506, + "epoch": 2.93, + "learning_rate": 2.9084651612124143e-05, + "loss": 0.8181, + "step": 3464, + "task_loss": 0.6161485910415649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.53838050365448, + "epoch": 2.93, + "learning_rate": 2.9078613693998314e-05, + "loss": 0.6078, + "step": 3465, + "task_loss": 0.3643706142902374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.568381667137146, + "epoch": 2.93, + "learning_rate": 2.9072575775872478e-05, + "loss": 0.7643, + "step": 3466, + "task_loss": 0.3740490972995758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5097825527191162, + "epoch": 2.93, + "learning_rate": 2.9066537857746652e-05, + "loss": 0.599, + "step": 3467, + "task_loss": 0.5570055246353149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6624899506568909, + "epoch": 2.93, + "learning_rate": 2.9060499939620822e-05, + "loss": 0.6175, + "step": 3468, + "task_loss": 0.5191022157669067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9154953360557556, + "epoch": 2.93, + "learning_rate": 2.905446202149499e-05, + "loss": 0.7352, + "step": 3469, + "task_loss": 1.5485376119613647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7692798376083374, + "epoch": 2.93, + "learning_rate": 2.904842410336916e-05, + "loss": 0.8144, + "step": 3470, + "task_loss": 1.2589569091796875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7795748710632324, + "epoch": 2.93, + "learning_rate": 2.904238618524333e-05, + "loss": 0.9128, + "step": 3471, + "task_loss": 0.9293898344039917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.489696741104126, + "epoch": 2.93, + "learning_rate": 2.9036348267117498e-05, + "loss": 0.8576, + "step": 3472, + "task_loss": 0.35216429829597473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6539574265480042, + "epoch": 2.94, + "learning_rate": 2.903031034899167e-05, + "loss": 0.638, + "step": 3473, + "task_loss": 0.7694327235221863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7084947228431702, + "epoch": 2.94, + "learning_rate": 2.902427243086584e-05, + "loss": 0.6679, + "step": 3474, + "task_loss": 1.428110957145691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6698046922683716, + "epoch": 2.94, + "learning_rate": 2.901823451274001e-05, + "loss": 0.8171, + "step": 3475, + "task_loss": 0.4634069800376892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6663632392883301, + "epoch": 2.94, + "learning_rate": 2.9012196594614177e-05, + "loss": 0.6838, + "step": 3476, + "task_loss": 0.6905601024627686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7209047079086304, + "epoch": 2.94, + "learning_rate": 2.9006158676488347e-05, + "loss": 0.6253, + "step": 3477, + "task_loss": 0.6825904250144958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47803089022636414, + "epoch": 2.94, + "learning_rate": 2.900012075836252e-05, + "loss": 0.6214, + "step": 3478, + "task_loss": 0.5431719422340393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.798856794834137, + "epoch": 2.94, + "learning_rate": 2.8994082840236685e-05, + "loss": 0.7845, + "step": 3479, + "task_loss": 1.5912142992019653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3649296760559082, + "epoch": 2.94, + "learning_rate": 2.898804492211086e-05, + "loss": 0.8058, + "step": 3480, + "task_loss": 1.0541822910308838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7737277746200562, + "epoch": 2.94, + "learning_rate": 2.898200700398503e-05, + "loss": 0.9209, + "step": 3481, + "task_loss": 1.071914553642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7200210094451904, + "epoch": 2.94, + "learning_rate": 2.8975969085859193e-05, + "loss": 0.7199, + "step": 3482, + "task_loss": 0.45951420068740845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42172491550445557, + "epoch": 2.94, + "learning_rate": 2.8969931167733367e-05, + "loss": 0.6139, + "step": 3483, + "task_loss": 0.35511523485183716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7546212673187256, + "epoch": 2.94, + "learning_rate": 2.8963893249607538e-05, + "loss": 0.7285, + "step": 3484, + "task_loss": 1.1285014152526855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9378154277801514, + "epoch": 2.95, + "learning_rate": 2.895785533148171e-05, + "loss": 0.7675, + "step": 3485, + "task_loss": 1.2741862535476685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5178618431091309, + "epoch": 2.95, + "learning_rate": 2.8951817413355876e-05, + "loss": 0.8078, + "step": 3486, + "task_loss": 0.6588294506072998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2086021900177002, + "epoch": 2.95, + "learning_rate": 2.8945779495230046e-05, + "loss": 0.8567, + "step": 3487, + "task_loss": 1.3689804077148438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47692954540252686, + "epoch": 2.95, + "learning_rate": 2.8939741577104217e-05, + "loss": 0.6614, + "step": 3488, + "task_loss": 0.659242570400238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9082344174385071, + "epoch": 2.95, + "learning_rate": 2.8933703658978384e-05, + "loss": 0.7667, + "step": 3489, + "task_loss": 1.9606081247329712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5256115198135376, + "epoch": 2.95, + "learning_rate": 2.8927665740852555e-05, + "loss": 0.4659, + "step": 3490, + "task_loss": 0.32639172673225403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5164555907249451, + "epoch": 2.95, + "learning_rate": 2.8921627822726725e-05, + "loss": 0.6752, + "step": 3491, + "task_loss": 0.45916610956192017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.601751446723938, + "epoch": 2.95, + "learning_rate": 2.8915589904600892e-05, + "loss": 0.6905, + "step": 3492, + "task_loss": 0.66357421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46919217705726624, + "epoch": 2.95, + "learning_rate": 2.8909551986475063e-05, + "loss": 0.5786, + "step": 3493, + "task_loss": 0.2836116850376129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0456271171569824, + "epoch": 2.95, + "learning_rate": 2.8903514068349237e-05, + "loss": 0.9233, + "step": 3494, + "task_loss": 1.2850019931793213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4371858537197113, + "epoch": 2.95, + "learning_rate": 2.8897476150223407e-05, + "loss": 0.53, + "step": 3495, + "task_loss": 0.8951230645179749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.784016489982605, + "epoch": 2.95, + "learning_rate": 2.889143823209757e-05, + "loss": 0.7503, + "step": 3496, + "task_loss": 1.0585284233093262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9110751748085022, + "epoch": 2.96, + "learning_rate": 2.8885400313971745e-05, + "loss": 0.7518, + "step": 3497, + "task_loss": 1.1744385957717896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8005282878875732, + "epoch": 2.96, + "learning_rate": 2.8879362395845916e-05, + "loss": 0.7192, + "step": 3498, + "task_loss": 1.1871466636657715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9272075891494751, + "epoch": 2.96, + "learning_rate": 2.8873324477720083e-05, + "loss": 0.908, + "step": 3499, + "task_loss": 0.5407767295837402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7700062990188599, + "epoch": 2.96, + "learning_rate": 2.8867286559594254e-05, + "loss": 0.6472, + "step": 3500, + "task_loss": 1.0530611276626587 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.8907722772277228, + "eval_loss": 0.43368417024612427, + "eval_runtime": 227.1671, + "eval_samples_per_second": 111.152, + "eval_steps_per_second": 0.872, + "step": 3500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5895242691040039, + "epoch": 2.96, + "learning_rate": 2.8861248641468424e-05, + "loss": 0.5867, + "step": 3501, + "task_loss": 1.4885001182556152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6212030649185181, + "epoch": 2.96, + "learning_rate": 2.885521072334259e-05, + "loss": 0.7024, + "step": 3502, + "task_loss": 0.5673592686653137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7216736078262329, + "epoch": 2.96, + "learning_rate": 2.8849172805216762e-05, + "loss": 0.5715, + "step": 3503, + "task_loss": 0.2849804162979126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8654711842536926, + "epoch": 2.96, + "learning_rate": 2.8843134887090932e-05, + "loss": 0.7444, + "step": 3504, + "task_loss": 1.7764543294906616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35397881269454956, + "epoch": 2.96, + "learning_rate": 2.8837096968965106e-05, + "loss": 0.5267, + "step": 3505, + "task_loss": 1.782389521598816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0136513710021973, + "epoch": 2.96, + "learning_rate": 2.883105905083927e-05, + "loss": 0.6698, + "step": 3506, + "task_loss": 1.268263816833496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7251628637313843, + "epoch": 2.96, + "learning_rate": 2.882502113271344e-05, + "loss": 0.8858, + "step": 3507, + "task_loss": 1.0930500030517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3958415985107422, + "epoch": 2.96, + "learning_rate": 2.8818983214587615e-05, + "loss": 0.8625, + "step": 3508, + "task_loss": 0.7988516092300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3297687768936157, + "epoch": 2.97, + "learning_rate": 2.881294529646178e-05, + "loss": 0.9801, + "step": 3509, + "task_loss": 1.4945991039276123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8767632842063904, + "epoch": 2.97, + "learning_rate": 2.8806907378335952e-05, + "loss": 0.7777, + "step": 3510, + "task_loss": 2.163203001022339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6318197250366211, + "epoch": 2.97, + "learning_rate": 2.8800869460210123e-05, + "loss": 1.0891, + "step": 3511, + "task_loss": 1.8774516582489014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6436439752578735, + "epoch": 2.97, + "learning_rate": 2.8794831542084287e-05, + "loss": 0.7685, + "step": 3512, + "task_loss": 0.9692749977111816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6486576199531555, + "epoch": 2.97, + "learning_rate": 2.878879362395846e-05, + "loss": 0.7953, + "step": 3513, + "task_loss": 1.781054139137268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5908812284469604, + "epoch": 2.97, + "learning_rate": 2.878275570583263e-05, + "loss": 0.6084, + "step": 3514, + "task_loss": 0.10436081141233444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6379696726799011, + "epoch": 2.97, + "learning_rate": 2.8776717787706802e-05, + "loss": 0.8485, + "step": 3515, + "task_loss": 1.4927537441253662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8152763843536377, + "epoch": 2.97, + "learning_rate": 2.877067986958097e-05, + "loss": 0.6518, + "step": 3516, + "task_loss": 0.7280385494232178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39982518553733826, + "epoch": 2.97, + "learning_rate": 2.876464195145514e-05, + "loss": 0.6249, + "step": 3517, + "task_loss": 0.5262916684150696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5574002265930176, + "epoch": 2.97, + "learning_rate": 2.875860403332931e-05, + "loss": 0.5957, + "step": 3518, + "task_loss": 0.5144563317298889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7622504234313965, + "epoch": 2.97, + "learning_rate": 2.8752566115203477e-05, + "loss": 0.6903, + "step": 3519, + "task_loss": 0.745737612247467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.472598135471344, + "epoch": 2.97, + "learning_rate": 2.8746528197077648e-05, + "loss": 0.6826, + "step": 3520, + "task_loss": 0.24479055404663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.291789174079895, + "epoch": 2.98, + "learning_rate": 2.8740490278951822e-05, + "loss": 0.6862, + "step": 3521, + "task_loss": 1.5910961627960205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8099395632743835, + "epoch": 2.98, + "learning_rate": 2.8734452360825986e-05, + "loss": 0.8086, + "step": 3522, + "task_loss": 0.39356908202171326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7453854084014893, + "epoch": 2.98, + "learning_rate": 2.8728414442700156e-05, + "loss": 0.6738, + "step": 3523, + "task_loss": 0.7309139370918274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7587268352508545, + "epoch": 2.98, + "learning_rate": 2.872237652457433e-05, + "loss": 0.682, + "step": 3524, + "task_loss": 0.7517848610877991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0082471370697021, + "epoch": 2.98, + "learning_rate": 2.8716338606448494e-05, + "loss": 0.7788, + "step": 3525, + "task_loss": 0.5595014095306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9076305627822876, + "epoch": 2.98, + "learning_rate": 2.8710300688322668e-05, + "loss": 0.842, + "step": 3526, + "task_loss": 0.5062964558601379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5072662830352783, + "epoch": 2.98, + "learning_rate": 2.870426277019684e-05, + "loss": 0.6921, + "step": 3527, + "task_loss": 1.0915141105651855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9681079983711243, + "epoch": 2.98, + "learning_rate": 2.869822485207101e-05, + "loss": 0.7579, + "step": 3528, + "task_loss": 0.9980337619781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.5782496929168701, + "epoch": 2.98, + "learning_rate": 2.8692186933945176e-05, + "loss": 0.7787, + "step": 3529, + "task_loss": 1.2713779211044312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7345553636550903, + "epoch": 2.98, + "learning_rate": 2.8686149015819347e-05, + "loss": 0.7156, + "step": 3530, + "task_loss": 1.9231317043304443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45823144912719727, + "epoch": 2.98, + "learning_rate": 2.8680111097693518e-05, + "loss": 0.7447, + "step": 3531, + "task_loss": 0.2629527449607849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5715450048446655, + "epoch": 2.99, + "learning_rate": 2.8674073179567685e-05, + "loss": 0.6841, + "step": 3532, + "task_loss": 0.4190598428249359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8194421529769897, + "epoch": 2.99, + "learning_rate": 2.8668035261441855e-05, + "loss": 0.7353, + "step": 3533, + "task_loss": 1.7012922763824463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6091118454933167, + "epoch": 2.99, + "learning_rate": 2.8661997343316026e-05, + "loss": 0.702, + "step": 3534, + "task_loss": 0.5715523362159729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.715003252029419, + "epoch": 2.99, + "learning_rate": 2.8655959425190193e-05, + "loss": 0.8046, + "step": 3535, + "task_loss": 0.8012397885322571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5494982004165649, + "epoch": 2.99, + "learning_rate": 2.8649921507064364e-05, + "loss": 0.7831, + "step": 3536, + "task_loss": 0.5300018191337585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5259543657302856, + "epoch": 2.99, + "learning_rate": 2.8643883588938538e-05, + "loss": 0.5291, + "step": 3537, + "task_loss": 1.088890552520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5186984539031982, + "epoch": 2.99, + "learning_rate": 2.8637845670812708e-05, + "loss": 0.619, + "step": 3538, + "task_loss": 0.8404145240783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8344303369522095, + "epoch": 2.99, + "learning_rate": 2.8631807752686872e-05, + "loss": 0.6524, + "step": 3539, + "task_loss": 1.8482786417007446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5549348592758179, + "epoch": 2.99, + "learning_rate": 2.8625769834561046e-05, + "loss": 0.7255, + "step": 3540, + "task_loss": 1.7015589475631714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5830883383750916, + "epoch": 2.99, + "learning_rate": 2.8619731916435216e-05, + "loss": 0.5847, + "step": 3541, + "task_loss": 0.8835998177528381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0474164485931396, + "epoch": 2.99, + "learning_rate": 2.8613693998309384e-05, + "loss": 0.8207, + "step": 3542, + "task_loss": 0.7856638431549072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6131393909454346, + "epoch": 2.99, + "learning_rate": 2.8607656080183554e-05, + "loss": 0.6238, + "step": 3543, + "task_loss": 1.630849838256836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46714574098587036, + "epoch": 3.0, + "learning_rate": 2.8601618162057725e-05, + "loss": 0.8634, + "step": 3544, + "task_loss": 0.7398757934570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41894951462745667, + "epoch": 3.0, + "learning_rate": 2.8595580243931892e-05, + "loss": 0.5324, + "step": 3545, + "task_loss": 0.567123293876648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2912333011627197, + "epoch": 3.0, + "learning_rate": 2.8589542325806063e-05, + "loss": 0.8322, + "step": 3546, + "task_loss": 1.039556860923767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6628650426864624, + "epoch": 3.0, + "learning_rate": 2.8583504407680233e-05, + "loss": 0.6427, + "step": 3547, + "task_loss": 0.7430834174156189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7094494104385376, + "epoch": 3.0, + "learning_rate": 2.8577466489554404e-05, + "loss": 0.5733, + "step": 3548, + "task_loss": 0.2793852984905243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8963229060173035, + "epoch": 3.0, + "learning_rate": 2.857142857142857e-05, + "loss": 0.6604, + "step": 3549, + "task_loss": 0.2541487216949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6640872955322266, + "epoch": 3.0, + "learning_rate": 2.856539065330274e-05, + "loss": 1.4603, + "step": 3550, + "task_loss": 0.9503651857376099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4315778613090515, + "epoch": 3.0, + "learning_rate": 2.8559352735176915e-05, + "loss": 0.5902, + "step": 3551, + "task_loss": 0.6308757066726685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8534530401229858, + "epoch": 3.0, + "learning_rate": 2.855331481705108e-05, + "loss": 0.63, + "step": 3552, + "task_loss": 0.3486424684524536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7001720666885376, + "epoch": 3.0, + "learning_rate": 2.8547276898925253e-05, + "loss": 0.5578, + "step": 3553, + "task_loss": 0.9340201616287231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8621876835823059, + "epoch": 3.0, + "learning_rate": 2.8541238980799424e-05, + "loss": 0.6507, + "step": 3554, + "task_loss": 0.7576887607574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4352402091026306, + "epoch": 3.01, + "learning_rate": 2.8535201062673588e-05, + "loss": 0.6523, + "step": 3555, + "task_loss": 0.4721548557281494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5970913171768188, + "epoch": 3.01, + "learning_rate": 2.852916314454776e-05, + "loss": 0.5143, + "step": 3556, + "task_loss": 0.7570653557777405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6047800779342651, + "epoch": 3.01, + "learning_rate": 2.8523125226421932e-05, + "loss": 0.6389, + "step": 3557, + "task_loss": 0.46549686789512634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.55394446849823, + "epoch": 3.01, + "learning_rate": 2.8517087308296103e-05, + "loss": 0.7055, + "step": 3558, + "task_loss": 0.8493452072143555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5711374878883362, + "epoch": 3.01, + "learning_rate": 2.851104939017027e-05, + "loss": 0.9064, + "step": 3559, + "task_loss": 0.6857057809829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7787516117095947, + "epoch": 3.01, + "learning_rate": 2.850501147204444e-05, + "loss": 0.6255, + "step": 3560, + "task_loss": 1.4883779287338257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5478712320327759, + "epoch": 3.01, + "learning_rate": 2.849897355391861e-05, + "loss": 0.7224, + "step": 3561, + "task_loss": 1.5161023139953613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7135688066482544, + "epoch": 3.01, + "learning_rate": 2.8492935635792778e-05, + "loss": 0.6565, + "step": 3562, + "task_loss": 0.3719140291213989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1531177759170532, + "epoch": 3.01, + "learning_rate": 2.848689771766695e-05, + "loss": 0.8902, + "step": 3563, + "task_loss": 0.7203177213668823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5592091083526611, + "epoch": 3.01, + "learning_rate": 2.848085979954112e-05, + "loss": 0.76, + "step": 3564, + "task_loss": 0.7370966076850891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5997725129127502, + "epoch": 3.01, + "learning_rate": 2.8474821881415286e-05, + "loss": 0.735, + "step": 3565, + "task_loss": 1.2105438709259033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9959639310836792, + "epoch": 3.01, + "learning_rate": 2.8468783963289457e-05, + "loss": 0.616, + "step": 3566, + "task_loss": 0.43924281001091003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6430736780166626, + "epoch": 3.02, + "learning_rate": 2.846274604516363e-05, + "loss": 0.8501, + "step": 3567, + "task_loss": 1.0086430311203003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8296298384666443, + "epoch": 3.02, + "learning_rate": 2.84567081270378e-05, + "loss": 0.7352, + "step": 3568, + "task_loss": 1.684377908706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5806262493133545, + "epoch": 3.02, + "learning_rate": 2.8450670208911965e-05, + "loss": 0.7075, + "step": 3569, + "task_loss": 0.48733314871788025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1771345138549805, + "epoch": 3.02, + "learning_rate": 2.844463229078614e-05, + "loss": 0.6196, + "step": 3570, + "task_loss": 1.2887576818466187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32745856046676636, + "epoch": 3.02, + "learning_rate": 2.843859437266031e-05, + "loss": 0.5654, + "step": 3571, + "task_loss": 0.4443657100200653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8458269238471985, + "epoch": 3.02, + "learning_rate": 2.8432556454534477e-05, + "loss": 0.837, + "step": 3572, + "task_loss": 0.7712803483009338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2610406875610352, + "epoch": 3.02, + "learning_rate": 2.8426518536408648e-05, + "loss": 0.6469, + "step": 3573, + "task_loss": 1.287882924079895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3581000864505768, + "epoch": 3.02, + "learning_rate": 2.8420480618282818e-05, + "loss": 0.5144, + "step": 3574, + "task_loss": 0.49181362986564636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6357303857803345, + "epoch": 3.02, + "learning_rate": 2.8414442700156985e-05, + "loss": 0.819, + "step": 3575, + "task_loss": 1.0158765316009521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7117151618003845, + "epoch": 3.02, + "learning_rate": 2.8408404782031156e-05, + "loss": 0.5709, + "step": 3576, + "task_loss": 0.8816638588905334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7645248174667358, + "epoch": 3.02, + "learning_rate": 2.8402366863905327e-05, + "loss": 0.6262, + "step": 3577, + "task_loss": 0.3988209068775177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38314932584762573, + "epoch": 3.02, + "learning_rate": 2.83963289457795e-05, + "loss": 0.7823, + "step": 3578, + "task_loss": 0.32058268785476685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6896148920059204, + "epoch": 3.03, + "learning_rate": 2.8390291027653664e-05, + "loss": 0.5591, + "step": 3579, + "task_loss": 0.6103711128234863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4302940368652344, + "epoch": 3.03, + "learning_rate": 2.8384253109527835e-05, + "loss": 0.6298, + "step": 3580, + "task_loss": 0.5452830195426941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.586306095123291, + "epoch": 3.03, + "learning_rate": 2.837821519140201e-05, + "loss": 0.7412, + "step": 3581, + "task_loss": 0.9802173972129822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5140480995178223, + "epoch": 3.03, + "learning_rate": 2.8372177273276173e-05, + "loss": 0.5335, + "step": 3582, + "task_loss": 0.6141536235809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6005988717079163, + "epoch": 3.03, + "learning_rate": 2.8366139355150347e-05, + "loss": 0.8635, + "step": 3583, + "task_loss": 1.576610803604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7433828115463257, + "epoch": 3.03, + "learning_rate": 2.8360101437024517e-05, + "loss": 0.5687, + "step": 3584, + "task_loss": 0.8613354563713074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3786167502403259, + "epoch": 3.03, + "learning_rate": 2.835406351889868e-05, + "loss": 0.613, + "step": 3585, + "task_loss": 0.6103795766830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8002123832702637, + "epoch": 3.03, + "learning_rate": 2.8348025600772855e-05, + "loss": 0.6844, + "step": 3586, + "task_loss": 0.6950814723968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3226242661476135, + "epoch": 3.03, + "learning_rate": 2.8341987682647025e-05, + "loss": 0.7632, + "step": 3587, + "task_loss": 1.1891282796859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.679539680480957, + "epoch": 3.03, + "learning_rate": 2.8335949764521196e-05, + "loss": 0.7384, + "step": 3588, + "task_loss": 1.099945306777954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4051525592803955, + "epoch": 3.03, + "learning_rate": 2.8329911846395363e-05, + "loss": 0.5794, + "step": 3589, + "task_loss": 0.39798703789711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0923786163330078, + "epoch": 3.03, + "learning_rate": 2.8323873928269534e-05, + "loss": 0.791, + "step": 3590, + "task_loss": 1.001700520515442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8324576020240784, + "epoch": 3.04, + "learning_rate": 2.8317836010143704e-05, + "loss": 0.7296, + "step": 3591, + "task_loss": 1.239029049873352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7699583172798157, + "epoch": 3.04, + "learning_rate": 2.831179809201787e-05, + "loss": 0.7131, + "step": 3592, + "task_loss": 1.0743050575256348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6002956628799438, + "epoch": 3.04, + "learning_rate": 2.8305760173892042e-05, + "loss": 0.6503, + "step": 3593, + "task_loss": 0.3002236485481262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3624074459075928, + "epoch": 3.04, + "learning_rate": 2.8299722255766216e-05, + "loss": 0.7515, + "step": 3594, + "task_loss": 1.0848666429519653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6987748146057129, + "epoch": 3.04, + "learning_rate": 2.829368433764038e-05, + "loss": 0.6866, + "step": 3595, + "task_loss": 0.7544407248497009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6010640859603882, + "epoch": 3.04, + "learning_rate": 2.828764641951455e-05, + "loss": 0.4936, + "step": 3596, + "task_loss": 0.6365046501159668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8547073006629944, + "epoch": 3.04, + "learning_rate": 2.8281608501388724e-05, + "loss": 0.6264, + "step": 3597, + "task_loss": 1.7877731323242188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5024874210357666, + "epoch": 3.04, + "learning_rate": 2.8275570583262895e-05, + "loss": 0.5506, + "step": 3598, + "task_loss": 0.9955921769142151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7183303833007812, + "epoch": 3.04, + "learning_rate": 2.8269532665137062e-05, + "loss": 0.6703, + "step": 3599, + "task_loss": 1.0533134937286377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.161064624786377, + "epoch": 3.04, + "learning_rate": 2.8263494747011233e-05, + "loss": 0.8192, + "step": 3600, + "task_loss": 0.5414749979972839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.798387348651886, + "epoch": 3.04, + "learning_rate": 2.8257456828885403e-05, + "loss": 0.644, + "step": 3601, + "task_loss": 1.8769289255142212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9048281311988831, + "epoch": 3.04, + "learning_rate": 2.825141891075957e-05, + "loss": 0.9795, + "step": 3602, + "task_loss": 2.0014753341674805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.664453387260437, + "epoch": 3.05, + "learning_rate": 2.824538099263374e-05, + "loss": 0.5806, + "step": 3603, + "task_loss": 0.8743603229522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9036272764205933, + "epoch": 3.05, + "learning_rate": 2.823934307450791e-05, + "loss": 0.6876, + "step": 3604, + "task_loss": 0.4418850243091583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8576907515525818, + "epoch": 3.05, + "learning_rate": 2.823330515638208e-05, + "loss": 0.5589, + "step": 3605, + "task_loss": 0.3267512023448944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4059901535511017, + "epoch": 3.05, + "learning_rate": 2.822726723825625e-05, + "loss": 0.4068, + "step": 3606, + "task_loss": 0.662968099117279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8640851974487305, + "epoch": 3.05, + "learning_rate": 2.822122932013042e-05, + "loss": 0.5747, + "step": 3607, + "task_loss": 0.3103358745574951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1322784423828125, + "epoch": 3.05, + "learning_rate": 2.8215191402004594e-05, + "loss": 0.8231, + "step": 3608, + "task_loss": 0.7384780049324036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45259132981300354, + "epoch": 3.05, + "learning_rate": 2.8209153483878758e-05, + "loss": 0.6312, + "step": 3609, + "task_loss": 0.5664212107658386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.254497230052948, + "epoch": 3.05, + "learning_rate": 2.820311556575293e-05, + "loss": 0.5668, + "step": 3610, + "task_loss": 0.4529813528060913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6295821666717529, + "epoch": 3.05, + "learning_rate": 2.8197077647627102e-05, + "loss": 0.5766, + "step": 3611, + "task_loss": 0.6987727284431458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49692821502685547, + "epoch": 3.05, + "learning_rate": 2.8191039729501266e-05, + "loss": 0.5519, + "step": 3612, + "task_loss": 0.4025169014930725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30930906534194946, + "epoch": 3.05, + "learning_rate": 2.818500181137544e-05, + "loss": 0.5756, + "step": 3613, + "task_loss": 0.2295258641242981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6490362286567688, + "epoch": 3.05, + "learning_rate": 2.817896389324961e-05, + "loss": 0.622, + "step": 3614, + "task_loss": 0.7238801121711731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30780038237571716, + "epoch": 3.06, + "learning_rate": 2.8172925975123778e-05, + "loss": 0.5441, + "step": 3615, + "task_loss": 0.4116123616695404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.535415530204773, + "epoch": 3.06, + "learning_rate": 2.8166888056997948e-05, + "loss": 0.6106, + "step": 3616, + "task_loss": 0.33095329999923706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7923001050949097, + "epoch": 3.06, + "learning_rate": 2.816085013887212e-05, + "loss": 0.6397, + "step": 3617, + "task_loss": 1.055081844329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.886846661567688, + "epoch": 3.06, + "learning_rate": 2.815481222074629e-05, + "loss": 0.6598, + "step": 3618, + "task_loss": 0.8735306859016418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4023129343986511, + "epoch": 3.06, + "learning_rate": 2.8148774302620457e-05, + "loss": 0.7718, + "step": 3619, + "task_loss": 0.09780261665582657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3630629777908325, + "epoch": 3.06, + "learning_rate": 2.8142736384494627e-05, + "loss": 0.5759, + "step": 3620, + "task_loss": 0.12758219242095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8883981704711914, + "epoch": 3.06, + "learning_rate": 2.8136698466368798e-05, + "loss": 0.7524, + "step": 3621, + "task_loss": 0.5994730591773987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5292114019393921, + "epoch": 3.06, + "learning_rate": 2.8130660548242965e-05, + "loss": 0.7045, + "step": 3622, + "task_loss": 0.9696571230888367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6077001094818115, + "epoch": 3.06, + "learning_rate": 2.8124622630117136e-05, + "loss": 0.5493, + "step": 3623, + "task_loss": 0.6632014513015747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5201268196105957, + "epoch": 3.06, + "learning_rate": 2.811858471199131e-05, + "loss": 0.7625, + "step": 3624, + "task_loss": 0.8669519424438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7835447192192078, + "epoch": 3.06, + "learning_rate": 2.8112546793865473e-05, + "loss": 0.6, + "step": 3625, + "task_loss": 1.0663723945617676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8852217197418213, + "epoch": 3.07, + "learning_rate": 2.8106508875739644e-05, + "loss": 0.6817, + "step": 3626, + "task_loss": 1.1238359212875366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6591657996177673, + "epoch": 3.07, + "learning_rate": 2.8100470957613818e-05, + "loss": 0.6157, + "step": 3627, + "task_loss": 0.3449137806892395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9209223985671997, + "epoch": 3.07, + "learning_rate": 2.809443303948799e-05, + "loss": 0.6752, + "step": 3628, + "task_loss": 0.5252447724342346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3900834619998932, + "epoch": 3.07, + "learning_rate": 2.8088395121362156e-05, + "loss": 0.502, + "step": 3629, + "task_loss": 0.5338437557220459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7916116118431091, + "epoch": 3.07, + "learning_rate": 2.8082357203236326e-05, + "loss": 0.6483, + "step": 3630, + "task_loss": 0.6517319679260254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9732638597488403, + "epoch": 3.07, + "learning_rate": 2.8076319285110497e-05, + "loss": 0.7296, + "step": 3631, + "task_loss": 1.3063803911209106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8414730429649353, + "epoch": 3.07, + "learning_rate": 2.8070281366984664e-05, + "loss": 0.6335, + "step": 3632, + "task_loss": 0.30291834473609924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7128413915634155, + "epoch": 3.07, + "learning_rate": 2.8064243448858834e-05, + "loss": 0.7648, + "step": 3633, + "task_loss": 1.2600635290145874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.767766535282135, + "epoch": 3.07, + "learning_rate": 2.8058205530733005e-05, + "loss": 0.6109, + "step": 3634, + "task_loss": 0.4626900851726532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5590154528617859, + "epoch": 3.07, + "learning_rate": 2.8052167612607172e-05, + "loss": 0.7046, + "step": 3635, + "task_loss": 0.4575844705104828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.658909797668457, + "epoch": 3.07, + "learning_rate": 2.8046129694481343e-05, + "loss": 0.5722, + "step": 3636, + "task_loss": 0.2537018060684204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3340317904949188, + "epoch": 3.07, + "learning_rate": 2.8040091776355513e-05, + "loss": 0.5649, + "step": 3637, + "task_loss": 0.9174350500106812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6891013979911804, + "epoch": 3.08, + "learning_rate": 2.8034053858229687e-05, + "loss": 0.7036, + "step": 3638, + "task_loss": 0.6399855017662048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5071496963500977, + "epoch": 3.08, + "learning_rate": 2.802801594010385e-05, + "loss": 0.569, + "step": 3639, + "task_loss": 0.21990296244621277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8344600796699524, + "epoch": 3.08, + "learning_rate": 2.8021978021978025e-05, + "loss": 0.6986, + "step": 3640, + "task_loss": 1.393924355506897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7259501218795776, + "epoch": 3.08, + "learning_rate": 2.8015940103852196e-05, + "loss": 0.7242, + "step": 3641, + "task_loss": 0.33193981647491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.67268306016922, + "epoch": 3.08, + "learning_rate": 2.800990218572636e-05, + "loss": 0.6721, + "step": 3642, + "task_loss": 1.0997542142868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6848774552345276, + "epoch": 3.08, + "learning_rate": 2.8003864267600533e-05, + "loss": 0.6455, + "step": 3643, + "task_loss": 0.5901796221733093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.528223991394043, + "epoch": 3.08, + "learning_rate": 2.7997826349474704e-05, + "loss": 0.5825, + "step": 3644, + "task_loss": 0.5343807935714722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6142597794532776, + "epoch": 3.08, + "learning_rate": 2.799178843134887e-05, + "loss": 0.4828, + "step": 3645, + "task_loss": 0.19231250882148743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4365150034427643, + "epoch": 3.08, + "learning_rate": 2.7985750513223042e-05, + "loss": 0.5505, + "step": 3646, + "task_loss": 0.5565472841262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6380513906478882, + "epoch": 3.08, + "learning_rate": 2.7979712595097212e-05, + "loss": 0.7049, + "step": 3647, + "task_loss": 0.6846076250076294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3350088894367218, + "epoch": 3.08, + "learning_rate": 2.7973674676971383e-05, + "loss": 0.7559, + "step": 3648, + "task_loss": 0.790398895740509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.721883237361908, + "epoch": 3.08, + "learning_rate": 2.796763675884555e-05, + "loss": 0.6988, + "step": 3649, + "task_loss": 0.4710249900817871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7126713395118713, + "epoch": 3.09, + "learning_rate": 2.796159884071972e-05, + "loss": 0.62, + "step": 3650, + "task_loss": 1.6085669994354248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48656484484672546, + "epoch": 3.09, + "learning_rate": 2.7955560922593895e-05, + "loss": 0.8002, + "step": 3651, + "task_loss": 0.7507634162902832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.118462324142456, + "epoch": 3.09, + "learning_rate": 2.794952300446806e-05, + "loss": 0.9007, + "step": 3652, + "task_loss": 1.2093141078948975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7298305034637451, + "epoch": 3.09, + "learning_rate": 2.794348508634223e-05, + "loss": 0.6903, + "step": 3653, + "task_loss": 0.8094022870063782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0075494050979614, + "epoch": 3.09, + "learning_rate": 2.7937447168216403e-05, + "loss": 0.9123, + "step": 3654, + "task_loss": 1.181075096130371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6745384931564331, + "epoch": 3.09, + "learning_rate": 2.7931409250090567e-05, + "loss": 0.6325, + "step": 3655, + "task_loss": 0.7473085522651672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2099676132202148, + "epoch": 3.09, + "learning_rate": 2.792537133196474e-05, + "loss": 0.7002, + "step": 3656, + "task_loss": 1.2366188764572144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5508182048797607, + "epoch": 3.09, + "learning_rate": 2.791933341383891e-05, + "loss": 0.8052, + "step": 3657, + "task_loss": 0.5048431158065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7220621109008789, + "epoch": 3.09, + "learning_rate": 2.7913295495713082e-05, + "loss": 0.8288, + "step": 3658, + "task_loss": 0.9081284999847412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.644292414188385, + "epoch": 3.09, + "learning_rate": 2.790725757758725e-05, + "loss": 0.4916, + "step": 3659, + "task_loss": 0.505931556224823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0231115818023682, + "epoch": 3.09, + "learning_rate": 2.790121965946142e-05, + "loss": 0.6745, + "step": 3660, + "task_loss": 0.6946086287498474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8683412671089172, + "epoch": 3.09, + "learning_rate": 2.789518174133559e-05, + "loss": 0.674, + "step": 3661, + "task_loss": 1.1246572732925415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5513156652450562, + "epoch": 3.1, + "learning_rate": 2.7889143823209757e-05, + "loss": 0.6882, + "step": 3662, + "task_loss": 1.1743996143341064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4099823832511902, + "epoch": 3.1, + "learning_rate": 2.7883105905083928e-05, + "loss": 0.6084, + "step": 3663, + "task_loss": 0.9822730422019958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6249244809150696, + "epoch": 3.1, + "learning_rate": 2.78770679869581e-05, + "loss": 0.8708, + "step": 3664, + "task_loss": 1.5089528560638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8995240926742554, + "epoch": 3.1, + "learning_rate": 2.7871030068832266e-05, + "loss": 0.7189, + "step": 3665, + "task_loss": 0.6115638613700867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7424576282501221, + "epoch": 3.1, + "learning_rate": 2.7864992150706436e-05, + "loss": 0.5726, + "step": 3666, + "task_loss": 0.438534140586853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5751792192459106, + "epoch": 3.1, + "learning_rate": 2.785895423258061e-05, + "loss": 0.5997, + "step": 3667, + "task_loss": 0.40415582060813904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3761516213417053, + "epoch": 3.1, + "learning_rate": 2.785291631445478e-05, + "loss": 0.6507, + "step": 3668, + "task_loss": 0.13075126707553864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5189539790153503, + "epoch": 3.1, + "learning_rate": 2.7846878396328945e-05, + "loss": 0.5213, + "step": 3669, + "task_loss": 0.2792453467845917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4724988341331482, + "epoch": 3.1, + "learning_rate": 2.784084047820312e-05, + "loss": 0.5688, + "step": 3670, + "task_loss": 0.7488003969192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4064655303955078, + "epoch": 3.1, + "learning_rate": 2.783480256007729e-05, + "loss": 0.509, + "step": 3671, + "task_loss": 0.7058221697807312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6107677221298218, + "epoch": 3.1, + "learning_rate": 2.7828764641951456e-05, + "loss": 0.4979, + "step": 3672, + "task_loss": 0.29844528436660767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1922931671142578, + "epoch": 3.1, + "learning_rate": 2.7822726723825627e-05, + "loss": 0.8774, + "step": 3673, + "task_loss": 0.5935751795768738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2958366870880127, + "epoch": 3.11, + "learning_rate": 2.7816688805699797e-05, + "loss": 0.6235, + "step": 3674, + "task_loss": 0.2845328152179718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46887263655662537, + "epoch": 3.11, + "learning_rate": 2.7810650887573965e-05, + "loss": 0.7092, + "step": 3675, + "task_loss": 0.3006347715854645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5558429956436157, + "epoch": 3.11, + "learning_rate": 2.7804612969448135e-05, + "loss": 0.6714, + "step": 3676, + "task_loss": 0.25843146443367004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6814650297164917, + "epoch": 3.11, + "learning_rate": 2.7798575051322306e-05, + "loss": 0.6944, + "step": 3677, + "task_loss": 0.3054063022136688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6725668907165527, + "epoch": 3.11, + "learning_rate": 2.7792537133196476e-05, + "loss": 0.7572, + "step": 3678, + "task_loss": 1.0969210863113403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5995408296585083, + "epoch": 3.11, + "learning_rate": 2.7786499215070643e-05, + "loss": 0.7603, + "step": 3679, + "task_loss": 1.122049331665039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8708053827285767, + "epoch": 3.11, + "learning_rate": 2.7780461296944814e-05, + "loss": 0.699, + "step": 3680, + "task_loss": 1.114810585975647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35624855756759644, + "epoch": 3.11, + "learning_rate": 2.7774423378818988e-05, + "loss": 0.495, + "step": 3681, + "task_loss": 0.22323492169380188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5000277757644653, + "epoch": 3.11, + "learning_rate": 2.7768385460693152e-05, + "loss": 0.6666, + "step": 3682, + "task_loss": 0.691615104675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8480166792869568, + "epoch": 3.11, + "learning_rate": 2.7762347542567326e-05, + "loss": 0.921, + "step": 3683, + "task_loss": 1.2991292476654053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5013954639434814, + "epoch": 3.11, + "learning_rate": 2.7756309624441496e-05, + "loss": 0.5126, + "step": 3684, + "task_loss": 0.31024083495140076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3420479893684387, + "epoch": 3.11, + "learning_rate": 2.775027170631566e-05, + "loss": 0.5629, + "step": 3685, + "task_loss": 0.7034569978713989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6461314558982849, + "epoch": 3.12, + "learning_rate": 2.7744233788189834e-05, + "loss": 0.683, + "step": 3686, + "task_loss": 0.5709354877471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.824910581111908, + "epoch": 3.12, + "learning_rate": 2.7738195870064005e-05, + "loss": 0.6299, + "step": 3687, + "task_loss": 0.38113948702812195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6973512172698975, + "epoch": 3.12, + "learning_rate": 2.7732157951938175e-05, + "loss": 0.5573, + "step": 3688, + "task_loss": 0.15177257359027863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.992124080657959, + "epoch": 3.12, + "learning_rate": 2.7726120033812342e-05, + "loss": 0.7601, + "step": 3689, + "task_loss": 0.9330900311470032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.399638295173645, + "epoch": 3.12, + "learning_rate": 2.7720082115686513e-05, + "loss": 0.6703, + "step": 3690, + "task_loss": 1.2728360891342163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5215007662773132, + "epoch": 3.12, + "learning_rate": 2.7714044197560684e-05, + "loss": 0.5585, + "step": 3691, + "task_loss": 0.5350586175918579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5828335881233215, + "epoch": 3.12, + "learning_rate": 2.770800627943485e-05, + "loss": 0.3995, + "step": 3692, + "task_loss": 0.594528317451477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8291279077529907, + "epoch": 3.12, + "learning_rate": 2.770196836130902e-05, + "loss": 0.6702, + "step": 3693, + "task_loss": 0.9862347841262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23790481686592102, + "epoch": 3.12, + "learning_rate": 2.7695930443183192e-05, + "loss": 0.5624, + "step": 3694, + "task_loss": 0.03650989755988121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5780038237571716, + "epoch": 3.12, + "learning_rate": 2.768989252505736e-05, + "loss": 0.6121, + "step": 3695, + "task_loss": 1.1103105545043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9621471166610718, + "epoch": 3.12, + "learning_rate": 2.768385460693153e-05, + "loss": 0.75, + "step": 3696, + "task_loss": 1.7560906410217285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9815957546234131, + "epoch": 3.13, + "learning_rate": 2.7677816688805704e-05, + "loss": 0.7821, + "step": 3697, + "task_loss": 2.156797170639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.955795407295227, + "epoch": 3.13, + "learning_rate": 2.7671778770679874e-05, + "loss": 0.7939, + "step": 3698, + "task_loss": 1.3579325675964355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5660983324050903, + "epoch": 3.13, + "learning_rate": 2.7665740852554038e-05, + "loss": 0.6988, + "step": 3699, + "task_loss": 0.5261045098304749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48553627729415894, + "epoch": 3.13, + "learning_rate": 2.7659702934428212e-05, + "loss": 0.7183, + "step": 3700, + "task_loss": 0.7135216593742371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7787368297576904, + "epoch": 3.13, + "learning_rate": 2.7653665016302382e-05, + "loss": 0.6165, + "step": 3701, + "task_loss": 0.4789673686027527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3822312355041504, + "epoch": 3.13, + "learning_rate": 2.764762709817655e-05, + "loss": 0.6976, + "step": 3702, + "task_loss": 0.3869742751121521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3307849168777466, + "epoch": 3.13, + "learning_rate": 2.764158918005072e-05, + "loss": 0.7157, + "step": 3703, + "task_loss": 0.2516452670097351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5310413241386414, + "epoch": 3.13, + "learning_rate": 2.763555126192489e-05, + "loss": 0.6498, + "step": 3704, + "task_loss": 0.5294978022575378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8055798411369324, + "epoch": 3.13, + "learning_rate": 2.7629513343799058e-05, + "loss": 0.7803, + "step": 3705, + "task_loss": 1.8335318565368652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.962742269039154, + "epoch": 3.13, + "learning_rate": 2.762347542567323e-05, + "loss": 0.7151, + "step": 3706, + "task_loss": 1.055146336555481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7460930943489075, + "epoch": 3.13, + "learning_rate": 2.76174375075474e-05, + "loss": 0.7352, + "step": 3707, + "task_loss": 0.8432438373565674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.502670407295227, + "epoch": 3.13, + "learning_rate": 2.7611399589421566e-05, + "loss": 0.5972, + "step": 3708, + "task_loss": 0.1334424614906311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1671204566955566, + "epoch": 3.14, + "learning_rate": 2.7605361671295737e-05, + "loss": 0.917, + "step": 3709, + "task_loss": 1.006469488143921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4803660809993744, + "epoch": 3.14, + "learning_rate": 2.7599323753169907e-05, + "loss": 0.62, + "step": 3710, + "task_loss": 0.9983674883842468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7216548919677734, + "epoch": 3.14, + "learning_rate": 2.759328583504408e-05, + "loss": 0.6371, + "step": 3711, + "task_loss": 2.0298728942871094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41495662927627563, + "epoch": 3.14, + "learning_rate": 2.7587247916918245e-05, + "loss": 0.6291, + "step": 3712, + "task_loss": 0.4697630703449249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.446172833442688, + "epoch": 3.14, + "learning_rate": 2.758120999879242e-05, + "loss": 0.6026, + "step": 3713, + "task_loss": 0.8716745972633362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7951304912567139, + "epoch": 3.14, + "learning_rate": 2.757517208066659e-05, + "loss": 0.5652, + "step": 3714, + "task_loss": 0.303600937128067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5242997407913208, + "epoch": 3.14, + "learning_rate": 2.7569134162540754e-05, + "loss": 0.6211, + "step": 3715, + "task_loss": 0.19346529245376587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8720619678497314, + "epoch": 3.14, + "learning_rate": 2.7563096244414927e-05, + "loss": 0.6253, + "step": 3716, + "task_loss": 0.37648487091064453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7255346179008484, + "epoch": 3.14, + "learning_rate": 2.7557058326289098e-05, + "loss": 0.7578, + "step": 3717, + "task_loss": 1.1927930116653442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2192246913909912, + "epoch": 3.14, + "learning_rate": 2.7551020408163265e-05, + "loss": 0.7528, + "step": 3718, + "task_loss": 1.312962293624878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39314278960227966, + "epoch": 3.14, + "learning_rate": 2.7544982490037436e-05, + "loss": 0.5693, + "step": 3719, + "task_loss": 0.6926872134208679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.443636417388916, + "epoch": 3.14, + "learning_rate": 2.7538944571911606e-05, + "loss": 0.525, + "step": 3720, + "task_loss": 0.49242866039276123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7972447276115417, + "epoch": 3.15, + "learning_rate": 2.7532906653785777e-05, + "loss": 0.6394, + "step": 3721, + "task_loss": 1.440778374671936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5316053628921509, + "epoch": 3.15, + "learning_rate": 2.7526868735659944e-05, + "loss": 0.7658, + "step": 3722, + "task_loss": 1.345367670059204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8069190382957458, + "epoch": 3.15, + "learning_rate": 2.7520830817534115e-05, + "loss": 0.5788, + "step": 3723, + "task_loss": 0.955371618270874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3065193891525269, + "epoch": 3.15, + "learning_rate": 2.751479289940829e-05, + "loss": 0.8439, + "step": 3724, + "task_loss": 0.4560190737247467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6988447308540344, + "epoch": 3.15, + "learning_rate": 2.7508754981282452e-05, + "loss": 0.7659, + "step": 3725, + "task_loss": 0.4240686297416687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8805952072143555, + "epoch": 3.15, + "learning_rate": 2.7502717063156623e-05, + "loss": 0.7504, + "step": 3726, + "task_loss": 1.2965900897979736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49975308775901794, + "epoch": 3.15, + "learning_rate": 2.7496679145030797e-05, + "loss": 0.5595, + "step": 3727, + "task_loss": 0.5324035286903381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6466822028160095, + "epoch": 3.15, + "learning_rate": 2.749064122690496e-05, + "loss": 0.7106, + "step": 3728, + "task_loss": 0.5854632258415222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40485233068466187, + "epoch": 3.15, + "learning_rate": 2.7484603308779135e-05, + "loss": 0.583, + "step": 3729, + "task_loss": 0.2413056492805481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.354900598526001, + "epoch": 3.15, + "learning_rate": 2.7478565390653305e-05, + "loss": 0.5161, + "step": 3730, + "task_loss": 0.8037174344062805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7486984729766846, + "epoch": 3.15, + "learning_rate": 2.7472527472527476e-05, + "loss": 0.6241, + "step": 3731, + "task_loss": 0.6723860502243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4793328642845154, + "epoch": 3.15, + "learning_rate": 2.7466489554401643e-05, + "loss": 0.683, + "step": 3732, + "task_loss": 0.4852195978164673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1061885356903076, + "epoch": 3.16, + "learning_rate": 2.7460451636275814e-05, + "loss": 0.7154, + "step": 3733, + "task_loss": 0.9783468246459961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42131149768829346, + "epoch": 3.16, + "learning_rate": 2.7454413718149984e-05, + "loss": 0.5399, + "step": 3734, + "task_loss": 0.7833398580551147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7367215156555176, + "epoch": 3.16, + "learning_rate": 2.744837580002415e-05, + "loss": 0.5475, + "step": 3735, + "task_loss": 1.0372810363769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.408748060464859, + "epoch": 3.16, + "learning_rate": 2.7442337881898322e-05, + "loss": 0.5638, + "step": 3736, + "task_loss": 1.0570927858352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6918579339981079, + "epoch": 3.16, + "learning_rate": 2.7436299963772493e-05, + "loss": 0.6966, + "step": 3737, + "task_loss": 0.40757429599761963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7135032415390015, + "epoch": 3.16, + "learning_rate": 2.743026204564666e-05, + "loss": 0.6963, + "step": 3738, + "task_loss": 1.2679206132888794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27400535345077515, + "epoch": 3.16, + "learning_rate": 2.742422412752083e-05, + "loss": 0.6275, + "step": 3739, + "task_loss": 0.19586075842380524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4530479609966278, + "epoch": 3.16, + "learning_rate": 2.7418186209395004e-05, + "loss": 0.6745, + "step": 3740, + "task_loss": 1.4297794103622437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7493981122970581, + "epoch": 3.16, + "learning_rate": 2.7412148291269175e-05, + "loss": 0.6999, + "step": 3741, + "task_loss": 0.7350195646286011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7677335739135742, + "epoch": 3.16, + "learning_rate": 2.740611037314334e-05, + "loss": 0.8136, + "step": 3742, + "task_loss": 1.293740153312683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47273939847946167, + "epoch": 3.16, + "learning_rate": 2.7400072455017513e-05, + "loss": 0.6162, + "step": 3743, + "task_loss": 0.953671395778656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6389205455780029, + "epoch": 3.16, + "learning_rate": 2.7394034536891683e-05, + "loss": 0.6417, + "step": 3744, + "task_loss": 1.13491690158844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0024096965789795, + "epoch": 3.17, + "learning_rate": 2.738799661876585e-05, + "loss": 0.7572, + "step": 3745, + "task_loss": 0.8565533757209778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6544901132583618, + "epoch": 3.17, + "learning_rate": 2.738195870064002e-05, + "loss": 0.7463, + "step": 3746, + "task_loss": 0.6628578305244446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8597802519798279, + "epoch": 3.17, + "learning_rate": 2.737592078251419e-05, + "loss": 0.7201, + "step": 3747, + "task_loss": 0.8674101829528809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36789387464523315, + "epoch": 3.17, + "learning_rate": 2.736988286438836e-05, + "loss": 0.6084, + "step": 3748, + "task_loss": 1.1345446109771729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7231370210647583, + "epoch": 3.17, + "learning_rate": 2.736384494626253e-05, + "loss": 0.8606, + "step": 3749, + "task_loss": 0.6843928098678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5862494111061096, + "epoch": 3.17, + "learning_rate": 2.73578070281367e-05, + "loss": 0.5978, + "step": 3750, + "task_loss": 0.4334567189216614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38852691650390625, + "epoch": 3.17, + "learning_rate": 2.735176911001087e-05, + "loss": 0.5267, + "step": 3751, + "task_loss": 0.3833361566066742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36267077922821045, + "epoch": 3.17, + "learning_rate": 2.7345731191885038e-05, + "loss": 0.5862, + "step": 3752, + "task_loss": 0.17595958709716797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47033047676086426, + "epoch": 3.17, + "learning_rate": 2.7339693273759208e-05, + "loss": 0.6806, + "step": 3753, + "task_loss": 0.7861241102218628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3805367350578308, + "epoch": 3.17, + "learning_rate": 2.7333655355633382e-05, + "loss": 0.6022, + "step": 3754, + "task_loss": 1.2618480920791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5910431146621704, + "epoch": 3.17, + "learning_rate": 2.7327617437507546e-05, + "loss": 0.6079, + "step": 3755, + "task_loss": 1.083288311958313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3335961699485779, + "epoch": 3.17, + "learning_rate": 2.7321579519381716e-05, + "loss": 0.4733, + "step": 3756, + "task_loss": 0.08060088753700256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6208626627922058, + "epoch": 3.18, + "learning_rate": 2.731554160125589e-05, + "loss": 0.7472, + "step": 3757, + "task_loss": 0.7524503469467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.548250675201416, + "epoch": 3.18, + "learning_rate": 2.7309503683130054e-05, + "loss": 0.4952, + "step": 3758, + "task_loss": 0.8591022491455078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.558382511138916, + "epoch": 3.18, + "learning_rate": 2.7303465765004228e-05, + "loss": 0.6555, + "step": 3759, + "task_loss": 1.2326316833496094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0760020017623901, + "epoch": 3.18, + "learning_rate": 2.72974278468784e-05, + "loss": 0.7811, + "step": 3760, + "task_loss": 0.47576460242271423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6894491910934448, + "epoch": 3.18, + "learning_rate": 2.729138992875257e-05, + "loss": 0.7841, + "step": 3761, + "task_loss": 0.5184053182601929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49788907170295715, + "epoch": 3.18, + "learning_rate": 2.7285352010626736e-05, + "loss": 0.5524, + "step": 3762, + "task_loss": 0.5793148875236511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6055487394332886, + "epoch": 3.18, + "learning_rate": 2.7279314092500907e-05, + "loss": 0.5082, + "step": 3763, + "task_loss": 0.40051424503326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5558630228042603, + "epoch": 3.18, + "learning_rate": 2.7273276174375078e-05, + "loss": 0.6046, + "step": 3764, + "task_loss": 1.1657373905181885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4367152154445648, + "epoch": 3.18, + "learning_rate": 2.7267238256249245e-05, + "loss": 0.7325, + "step": 3765, + "task_loss": 0.3441525995731354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6355109214782715, + "epoch": 3.18, + "learning_rate": 2.7261200338123415e-05, + "loss": 0.6441, + "step": 3766, + "task_loss": 0.945931077003479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.860523521900177, + "epoch": 3.18, + "learning_rate": 2.7255162419997586e-05, + "loss": 0.6012, + "step": 3767, + "task_loss": 1.1297026872634888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7198249697685242, + "epoch": 3.19, + "learning_rate": 2.7249124501871753e-05, + "loss": 0.5669, + "step": 3768, + "task_loss": 1.3856337070465088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9888213872909546, + "epoch": 3.19, + "learning_rate": 2.7243086583745924e-05, + "loss": 0.721, + "step": 3769, + "task_loss": 1.0374499559402466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5991783738136292, + "epoch": 3.19, + "learning_rate": 2.7237048665620098e-05, + "loss": 0.7023, + "step": 3770, + "task_loss": 0.48877325654029846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9571166038513184, + "epoch": 3.19, + "learning_rate": 2.7231010747494268e-05, + "loss": 0.8814, + "step": 3771, + "task_loss": 1.5043777227401733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36226242780685425, + "epoch": 3.19, + "learning_rate": 2.7224972829368432e-05, + "loss": 0.5564, + "step": 3772, + "task_loss": 0.6930634379386902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5540496706962585, + "epoch": 3.19, + "learning_rate": 2.7218934911242606e-05, + "loss": 0.6657, + "step": 3773, + "task_loss": 1.5520204305648804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3817112147808075, + "epoch": 3.19, + "learning_rate": 2.7212896993116777e-05, + "loss": 0.5759, + "step": 3774, + "task_loss": 0.9813417196273804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.816502571105957, + "epoch": 3.19, + "learning_rate": 2.7206859074990944e-05, + "loss": 0.8392, + "step": 3775, + "task_loss": 1.1619873046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3911612629890442, + "epoch": 3.19, + "learning_rate": 2.7200821156865114e-05, + "loss": 0.6507, + "step": 3776, + "task_loss": 0.624571681022644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.564736545085907, + "epoch": 3.19, + "learning_rate": 2.7194783238739285e-05, + "loss": 0.5966, + "step": 3777, + "task_loss": 0.5943394899368286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7511274814605713, + "epoch": 3.19, + "learning_rate": 2.7188745320613452e-05, + "loss": 0.6949, + "step": 3778, + "task_loss": 0.5647209882736206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4479868710041046, + "epoch": 3.19, + "learning_rate": 2.7182707402487623e-05, + "loss": 0.5585, + "step": 3779, + "task_loss": 0.3722969591617584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5413883924484253, + "epoch": 3.2, + "learning_rate": 2.7176669484361793e-05, + "loss": 0.6911, + "step": 3780, + "task_loss": 1.143428087234497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5554563999176025, + "epoch": 3.2, + "learning_rate": 2.7170631566235967e-05, + "loss": 0.6162, + "step": 3781, + "task_loss": 0.4169100224971771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8574235439300537, + "epoch": 3.2, + "learning_rate": 2.716459364811013e-05, + "loss": 0.7371, + "step": 3782, + "task_loss": 1.971543788909912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7909473180770874, + "epoch": 3.2, + "learning_rate": 2.71585557299843e-05, + "loss": 0.7167, + "step": 3783, + "task_loss": 0.8002767562866211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420628190040588, + "epoch": 3.2, + "learning_rate": 2.7152517811858476e-05, + "loss": 0.6198, + "step": 3784, + "task_loss": 0.4915619492530823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9211279153823853, + "epoch": 3.2, + "learning_rate": 2.714647989373264e-05, + "loss": 0.5692, + "step": 3785, + "task_loss": 1.0802326202392578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5529296398162842, + "epoch": 3.2, + "learning_rate": 2.7140441975606813e-05, + "loss": 0.5217, + "step": 3786, + "task_loss": 0.28511202335357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4289717972278595, + "epoch": 3.2, + "learning_rate": 2.7134404057480984e-05, + "loss": 0.4296, + "step": 3787, + "task_loss": 0.6858624219894409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.385250449180603, + "epoch": 3.2, + "learning_rate": 2.7128366139355148e-05, + "loss": 0.6946, + "step": 3788, + "task_loss": 0.7283570766448975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5088664293289185, + "epoch": 3.2, + "learning_rate": 2.712232822122932e-05, + "loss": 0.6174, + "step": 3789, + "task_loss": 0.7653895020484924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0906836986541748, + "epoch": 3.2, + "learning_rate": 2.7116290303103492e-05, + "loss": 0.7458, + "step": 3790, + "task_loss": 0.7550229430198669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5201867818832397, + "epoch": 3.2, + "learning_rate": 2.7110252384977663e-05, + "loss": 0.5505, + "step": 3791, + "task_loss": 0.5186569690704346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5407899022102356, + "epoch": 3.21, + "learning_rate": 2.710421446685183e-05, + "loss": 0.5613, + "step": 3792, + "task_loss": 0.4091385304927826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7789901494979858, + "epoch": 3.21, + "learning_rate": 2.7098176548726e-05, + "loss": 0.6923, + "step": 3793, + "task_loss": 1.0979026556015015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.549785852432251, + "epoch": 3.21, + "learning_rate": 2.709213863060017e-05, + "loss": 0.6122, + "step": 3794, + "task_loss": 0.22662149369716644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5956298112869263, + "epoch": 3.21, + "learning_rate": 2.7086100712474338e-05, + "loss": 0.5924, + "step": 3795, + "task_loss": 1.675180435180664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4998801052570343, + "epoch": 3.21, + "learning_rate": 2.708006279434851e-05, + "loss": 0.5475, + "step": 3796, + "task_loss": 0.9572819471359253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6800335645675659, + "epoch": 3.21, + "learning_rate": 2.7074024876222683e-05, + "loss": 0.591, + "step": 3797, + "task_loss": 0.7212218046188354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27396222949028015, + "epoch": 3.21, + "learning_rate": 2.7067986958096847e-05, + "loss": 0.5571, + "step": 3798, + "task_loss": 0.26156508922576904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7810419797897339, + "epoch": 3.21, + "learning_rate": 2.7061949039971017e-05, + "loss": 0.8228, + "step": 3799, + "task_loss": 0.45833438634872437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7736960649490356, + "epoch": 3.21, + "learning_rate": 2.705591112184519e-05, + "loss": 0.6531, + "step": 3800, + "task_loss": 0.587170422077179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6723458766937256, + "epoch": 3.21, + "learning_rate": 2.704987320371936e-05, + "loss": 0.7676, + "step": 3801, + "task_loss": 0.5839070677757263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4362712502479553, + "epoch": 3.21, + "learning_rate": 2.704383528559353e-05, + "loss": 0.7637, + "step": 3802, + "task_loss": 0.647459864616394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5533900260925293, + "epoch": 3.21, + "learning_rate": 2.70377973674677e-05, + "loss": 0.6052, + "step": 3803, + "task_loss": 0.6986656188964844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4195314347743988, + "epoch": 3.22, + "learning_rate": 2.703175944934187e-05, + "loss": 0.7278, + "step": 3804, + "task_loss": 1.187132716178894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.763897180557251, + "epoch": 3.22, + "learning_rate": 2.7025721531216037e-05, + "loss": 0.7961, + "step": 3805, + "task_loss": 0.42536187171936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5210390686988831, + "epoch": 3.22, + "learning_rate": 2.7019683613090208e-05, + "loss": 0.5256, + "step": 3806, + "task_loss": 0.4214737117290497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2576864957809448, + "epoch": 3.22, + "learning_rate": 2.701364569496438e-05, + "loss": 0.5541, + "step": 3807, + "task_loss": 0.6714580655097961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4546336233615875, + "epoch": 3.22, + "learning_rate": 2.7007607776838545e-05, + "loss": 0.6218, + "step": 3808, + "task_loss": 0.6319274306297302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.040923833847046, + "epoch": 3.22, + "learning_rate": 2.7001569858712716e-05, + "loss": 0.9007, + "step": 3809, + "task_loss": 0.5754467248916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5205569863319397, + "epoch": 3.22, + "learning_rate": 2.6995531940586887e-05, + "loss": 0.5994, + "step": 3810, + "task_loss": 0.7320914268493652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7630859613418579, + "epoch": 3.22, + "learning_rate": 2.698949402246106e-05, + "loss": 0.5938, + "step": 3811, + "task_loss": 1.2972216606140137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27662238478660583, + "epoch": 3.22, + "learning_rate": 2.6983456104335224e-05, + "loss": 0.5458, + "step": 3812, + "task_loss": 0.24180002510547638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4288104772567749, + "epoch": 3.22, + "learning_rate": 2.6977418186209395e-05, + "loss": 0.5961, + "step": 3813, + "task_loss": 0.29090648889541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1962366104125977, + "epoch": 3.22, + "learning_rate": 2.697138026808357e-05, + "loss": 0.6706, + "step": 3814, + "task_loss": 1.3018921613693237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6922852396965027, + "epoch": 3.22, + "learning_rate": 2.6965342349957733e-05, + "loss": 0.7684, + "step": 3815, + "task_loss": 1.7663601636886597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8151668310165405, + "epoch": 3.23, + "learning_rate": 2.6959304431831907e-05, + "loss": 0.9317, + "step": 3816, + "task_loss": 1.3193262815475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5841848254203796, + "epoch": 3.23, + "learning_rate": 2.6953266513706077e-05, + "loss": 0.5518, + "step": 3817, + "task_loss": 0.8994879126548767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6144908666610718, + "epoch": 3.23, + "learning_rate": 2.6947228595580244e-05, + "loss": 0.7568, + "step": 3818, + "task_loss": 0.9236342906951904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7763292789459229, + "epoch": 3.23, + "learning_rate": 2.6941190677454415e-05, + "loss": 0.5247, + "step": 3819, + "task_loss": 0.7072175145149231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7275692224502563, + "epoch": 3.23, + "learning_rate": 2.6935152759328586e-05, + "loss": 0.6969, + "step": 3820, + "task_loss": 1.620560884475708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8345743417739868, + "epoch": 3.23, + "learning_rate": 2.6929114841202756e-05, + "loss": 0.7907, + "step": 3821, + "task_loss": 1.094906210899353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8300758600234985, + "epoch": 3.23, + "learning_rate": 2.6923076923076923e-05, + "loss": 0.7788, + "step": 3822, + "task_loss": 1.9478230476379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5707115530967712, + "epoch": 3.23, + "learning_rate": 2.6917039004951094e-05, + "loss": 0.618, + "step": 3823, + "task_loss": 0.5562289953231812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42580896615982056, + "epoch": 3.23, + "learning_rate": 2.6911001086825264e-05, + "loss": 0.7177, + "step": 3824, + "task_loss": 0.5858167409896851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2559913992881775, + "epoch": 3.23, + "learning_rate": 2.690496316869943e-05, + "loss": 0.6197, + "step": 3825, + "task_loss": 0.1804138869047165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42227593064308167, + "epoch": 3.23, + "learning_rate": 2.6898925250573602e-05, + "loss": 0.54, + "step": 3826, + "task_loss": 0.5198736190795898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6105734705924988, + "epoch": 3.23, + "learning_rate": 2.6892887332447776e-05, + "loss": 0.6633, + "step": 3827, + "task_loss": 0.24023839831352234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7365256547927856, + "epoch": 3.24, + "learning_rate": 2.688684941432194e-05, + "loss": 0.643, + "step": 3828, + "task_loss": 0.6190977096557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5054594278335571, + "epoch": 3.24, + "learning_rate": 2.688081149619611e-05, + "loss": 0.5725, + "step": 3829, + "task_loss": 0.46155068278312683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4077319800853729, + "epoch": 3.24, + "learning_rate": 2.6874773578070285e-05, + "loss": 0.5678, + "step": 3830, + "task_loss": 0.4196246862411499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3718378245830536, + "epoch": 3.24, + "learning_rate": 2.6868735659944455e-05, + "loss": 0.5403, + "step": 3831, + "task_loss": 0.5666140913963318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9335667490959167, + "epoch": 3.24, + "learning_rate": 2.6862697741818622e-05, + "loss": 0.6919, + "step": 3832, + "task_loss": 1.687600016593933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9736859798431396, + "epoch": 3.24, + "learning_rate": 2.6856659823692793e-05, + "loss": 0.5869, + "step": 3833, + "task_loss": 0.5775043368339539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4921509027481079, + "epoch": 3.24, + "learning_rate": 2.6850621905566963e-05, + "loss": 0.6274, + "step": 3834, + "task_loss": 1.1227810382843018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5715256929397583, + "epoch": 3.24, + "learning_rate": 2.684458398744113e-05, + "loss": 0.7426, + "step": 3835, + "task_loss": 1.2261786460876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8505398035049438, + "epoch": 3.24, + "learning_rate": 2.68385460693153e-05, + "loss": 0.655, + "step": 3836, + "task_loss": 1.1689717769622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42035973072052, + "epoch": 3.24, + "learning_rate": 2.6832508151189472e-05, + "loss": 0.5447, + "step": 3837, + "task_loss": 0.6019532680511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5325599908828735, + "epoch": 3.24, + "learning_rate": 2.682647023306364e-05, + "loss": 0.5946, + "step": 3838, + "task_loss": 0.3388572335243225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2684626877307892, + "epoch": 3.24, + "learning_rate": 2.682043231493781e-05, + "loss": 0.706, + "step": 3839, + "task_loss": 0.2648860216140747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39670324325561523, + "epoch": 3.25, + "learning_rate": 2.681439439681198e-05, + "loss": 0.4397, + "step": 3840, + "task_loss": 0.7919542789459229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8736084699630737, + "epoch": 3.25, + "learning_rate": 2.6808356478686154e-05, + "loss": 0.6537, + "step": 3841, + "task_loss": 1.3946781158447266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6007811427116394, + "epoch": 3.25, + "learning_rate": 2.6802318560560318e-05, + "loss": 0.4864, + "step": 3842, + "task_loss": 0.7827214598655701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.9908289909362793, + "epoch": 3.25, + "learning_rate": 2.6796280642434492e-05, + "loss": 0.9942, + "step": 3843, + "task_loss": 1.1989620923995972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5278843641281128, + "epoch": 3.25, + "learning_rate": 2.6790242724308662e-05, + "loss": 0.6177, + "step": 3844, + "task_loss": 0.8241632580757141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5054795742034912, + "epoch": 3.25, + "learning_rate": 2.6784204806182826e-05, + "loss": 0.6058, + "step": 3845, + "task_loss": 0.39002248644828796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3030523657798767, + "epoch": 3.25, + "learning_rate": 2.6778166888057e-05, + "loss": 0.4318, + "step": 3846, + "task_loss": 0.26502785086631775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4107990264892578, + "epoch": 3.25, + "learning_rate": 2.677212896993117e-05, + "loss": 1.0368, + "step": 3847, + "task_loss": 1.201445460319519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.507205605506897, + "epoch": 3.25, + "learning_rate": 2.6766091051805338e-05, + "loss": 0.6553, + "step": 3848, + "task_loss": 1.0373141765594482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5042811632156372, + "epoch": 3.25, + "learning_rate": 2.676005313367951e-05, + "loss": 0.8051, + "step": 3849, + "task_loss": 1.494383454322815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24265123903751373, + "epoch": 3.25, + "learning_rate": 2.675401521555368e-05, + "loss": 0.4407, + "step": 3850, + "task_loss": 0.044261109083890915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8958820104598999, + "epoch": 3.26, + "learning_rate": 2.674797729742785e-05, + "loss": 0.584, + "step": 3851, + "task_loss": 0.5287253856658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6337130665779114, + "epoch": 3.26, + "learning_rate": 2.6741939379302017e-05, + "loss": 0.7723, + "step": 3852, + "task_loss": 0.7469601035118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7162208557128906, + "epoch": 3.26, + "learning_rate": 2.6735901461176187e-05, + "loss": 0.5828, + "step": 3853, + "task_loss": 0.330293744802475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6202374696731567, + "epoch": 3.26, + "learning_rate": 2.672986354305036e-05, + "loss": 0.903, + "step": 3854, + "task_loss": 1.0988820791244507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1776304244995117, + "epoch": 3.26, + "learning_rate": 2.6723825624924525e-05, + "loss": 0.8453, + "step": 3855, + "task_loss": 1.2135603427886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4645193815231323, + "epoch": 3.26, + "learning_rate": 2.6717787706798696e-05, + "loss": 0.4957, + "step": 3856, + "task_loss": 1.6712590456008911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5928220748901367, + "epoch": 3.26, + "learning_rate": 2.671174978867287e-05, + "loss": 0.4815, + "step": 3857, + "task_loss": 0.3807794451713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.701501190662384, + "epoch": 3.26, + "learning_rate": 2.6705711870547033e-05, + "loss": 0.6177, + "step": 3858, + "task_loss": 1.458693265914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9499773979187012, + "epoch": 3.26, + "learning_rate": 2.6699673952421207e-05, + "loss": 0.6705, + "step": 3859, + "task_loss": 0.662256121635437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37579917907714844, + "epoch": 3.26, + "learning_rate": 2.6693636034295378e-05, + "loss": 0.5736, + "step": 3860, + "task_loss": 0.6692442297935486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3625980615615845, + "epoch": 3.26, + "learning_rate": 2.668759811616955e-05, + "loss": 0.7529, + "step": 3861, + "task_loss": 1.7739009857177734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36655858159065247, + "epoch": 3.26, + "learning_rate": 2.6681560198043716e-05, + "loss": 0.5743, + "step": 3862, + "task_loss": 0.3933911919593811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48964786529541016, + "epoch": 3.27, + "learning_rate": 2.6675522279917886e-05, + "loss": 0.5849, + "step": 3863, + "task_loss": 0.4818389117717743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3185902237892151, + "epoch": 3.27, + "learning_rate": 2.6669484361792057e-05, + "loss": 0.4199, + "step": 3864, + "task_loss": 0.4691984951496124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4040016531944275, + "epoch": 3.27, + "learning_rate": 2.6663446443666224e-05, + "loss": 0.5411, + "step": 3865, + "task_loss": 0.5929793119430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4796018600463867, + "epoch": 3.27, + "learning_rate": 2.6657408525540395e-05, + "loss": 0.6123, + "step": 3866, + "task_loss": 0.5219969749450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4811915457248688, + "epoch": 3.27, + "learning_rate": 2.6651370607414565e-05, + "loss": 0.6456, + "step": 3867, + "task_loss": 0.44106876850128174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37166595458984375, + "epoch": 3.27, + "learning_rate": 2.6645332689288732e-05, + "loss": 0.4433, + "step": 3868, + "task_loss": 0.21932142972946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.088750958442688, + "epoch": 3.27, + "learning_rate": 2.6639294771162903e-05, + "loss": 0.7231, + "step": 3869, + "task_loss": 1.2116187810897827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6861915588378906, + "epoch": 3.27, + "learning_rate": 2.6633256853037077e-05, + "loss": 0.8649, + "step": 3870, + "task_loss": 0.9740768671035767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2734610438346863, + "epoch": 3.27, + "learning_rate": 2.6627218934911247e-05, + "loss": 0.5702, + "step": 3871, + "task_loss": 1.1948003768920898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7619774341583252, + "epoch": 3.27, + "learning_rate": 2.662118101678541e-05, + "loss": 0.7651, + "step": 3872, + "task_loss": 0.7293266654014587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.967250406742096, + "epoch": 3.27, + "learning_rate": 2.6615143098659585e-05, + "loss": 0.7331, + "step": 3873, + "task_loss": 0.7539423108100891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39383089542388916, + "epoch": 3.27, + "learning_rate": 2.6609105180533756e-05, + "loss": 0.4121, + "step": 3874, + "task_loss": 0.31008198857307434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3484002351760864, + "epoch": 3.28, + "learning_rate": 2.6603067262407923e-05, + "loss": 0.8422, + "step": 3875, + "task_loss": 0.6652863025665283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6814296245574951, + "epoch": 3.28, + "learning_rate": 2.6597029344282094e-05, + "loss": 0.629, + "step": 3876, + "task_loss": 0.7861500978469849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5805301070213318, + "epoch": 3.28, + "learning_rate": 2.6590991426156264e-05, + "loss": 0.7727, + "step": 3877, + "task_loss": 0.4359031617641449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6087177991867065, + "epoch": 3.28, + "learning_rate": 2.658495350803043e-05, + "loss": 0.5661, + "step": 3878, + "task_loss": 1.3420968055725098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8411298394203186, + "epoch": 3.28, + "learning_rate": 2.6578915589904602e-05, + "loss": 0.5906, + "step": 3879, + "task_loss": 0.3424343168735504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8254624009132385, + "epoch": 3.28, + "learning_rate": 2.6572877671778772e-05, + "loss": 0.743, + "step": 3880, + "task_loss": 2.4015679359436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3090619146823883, + "epoch": 3.28, + "learning_rate": 2.656683975365294e-05, + "loss": 0.5331, + "step": 3881, + "task_loss": 0.7764829993247986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5496142506599426, + "epoch": 3.28, + "learning_rate": 2.656080183552711e-05, + "loss": 0.7116, + "step": 3882, + "task_loss": 0.4755264222621918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7677972316741943, + "epoch": 3.28, + "learning_rate": 2.655476391740128e-05, + "loss": 0.6032, + "step": 3883, + "task_loss": 0.805168092250824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5757544040679932, + "epoch": 3.28, + "learning_rate": 2.6548725999275455e-05, + "loss": 0.7057, + "step": 3884, + "task_loss": 1.3406319618225098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.507298469543457, + "epoch": 3.28, + "learning_rate": 2.654268808114962e-05, + "loss": 0.6396, + "step": 3885, + "task_loss": 0.09997845441102982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9478985667228699, + "epoch": 3.28, + "learning_rate": 2.653665016302379e-05, + "loss": 0.8329, + "step": 3886, + "task_loss": 0.9102082848548889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6417235136032104, + "epoch": 3.29, + "learning_rate": 2.6530612244897963e-05, + "loss": 0.6198, + "step": 3887, + "task_loss": 0.5683102011680603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.791704535484314, + "epoch": 3.29, + "learning_rate": 2.6524574326772127e-05, + "loss": 0.5936, + "step": 3888, + "task_loss": 1.0623878240585327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6217017769813538, + "epoch": 3.29, + "learning_rate": 2.65185364086463e-05, + "loss": 0.6718, + "step": 3889, + "task_loss": 1.547390341758728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5323266983032227, + "epoch": 3.29, + "learning_rate": 2.651249849052047e-05, + "loss": 0.6094, + "step": 3890, + "task_loss": 0.9643723964691162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4152616858482361, + "epoch": 3.29, + "learning_rate": 2.650646057239464e-05, + "loss": 0.5605, + "step": 3891, + "task_loss": 0.43540284037590027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4223644733428955, + "epoch": 3.29, + "learning_rate": 2.650042265426881e-05, + "loss": 0.485, + "step": 3892, + "task_loss": 0.5213735699653625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4909425377845764, + "epoch": 3.29, + "learning_rate": 2.649438473614298e-05, + "loss": 0.5734, + "step": 3893, + "task_loss": 1.2381582260131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41863974928855896, + "epoch": 3.29, + "learning_rate": 2.648834681801715e-05, + "loss": 0.5931, + "step": 3894, + "task_loss": 0.4961846172809601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4648692011833191, + "epoch": 3.29, + "learning_rate": 2.6482308899891317e-05, + "loss": 0.5361, + "step": 3895, + "task_loss": 0.6191472411155701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28473180532455444, + "epoch": 3.29, + "learning_rate": 2.6476270981765488e-05, + "loss": 0.5372, + "step": 3896, + "task_loss": 0.705217182636261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.065242886543274, + "epoch": 3.29, + "learning_rate": 2.647023306363966e-05, + "loss": 0.8353, + "step": 3897, + "task_loss": 1.787498950958252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31734955310821533, + "epoch": 3.29, + "learning_rate": 2.6464195145513826e-05, + "loss": 0.5276, + "step": 3898, + "task_loss": 0.9646798968315125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4767318665981293, + "epoch": 3.3, + "learning_rate": 2.6458157227387996e-05, + "loss": 0.7424, + "step": 3899, + "task_loss": 1.5132758617401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6761322021484375, + "epoch": 3.3, + "learning_rate": 2.645211930926217e-05, + "loss": 0.5488, + "step": 3900, + "task_loss": 1.4400570392608643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4853355884552002, + "epoch": 3.3, + "learning_rate": 2.6446081391136334e-05, + "loss": 0.5332, + "step": 3901, + "task_loss": 0.6799299716949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7361021041870117, + "epoch": 3.3, + "learning_rate": 2.6440043473010505e-05, + "loss": 0.6561, + "step": 3902, + "task_loss": 1.4156526327133179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46797269582748413, + "epoch": 3.3, + "learning_rate": 2.643400555488468e-05, + "loss": 0.6549, + "step": 3903, + "task_loss": 1.5360777378082275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9150091409683228, + "epoch": 3.3, + "learning_rate": 2.642796763675885e-05, + "loss": 0.8199, + "step": 3904, + "task_loss": 1.369651198387146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8130684494972229, + "epoch": 3.3, + "learning_rate": 2.6421929718633016e-05, + "loss": 0.6326, + "step": 3905, + "task_loss": 0.7818535566329956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1520894765853882, + "epoch": 3.3, + "learning_rate": 2.6415891800507187e-05, + "loss": 0.6707, + "step": 3906, + "task_loss": 1.1094388961791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8521507382392883, + "epoch": 3.3, + "learning_rate": 2.6409853882381357e-05, + "loss": 0.6868, + "step": 3907, + "task_loss": 1.0143516063690186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7187873125076294, + "epoch": 3.3, + "learning_rate": 2.6403815964255525e-05, + "loss": 0.7431, + "step": 3908, + "task_loss": 0.811107873916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6099079847335815, + "epoch": 3.3, + "learning_rate": 2.6397778046129695e-05, + "loss": 0.689, + "step": 3909, + "task_loss": 0.9018011689186096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7623339295387268, + "epoch": 3.3, + "learning_rate": 2.6391740128003866e-05, + "loss": 0.7054, + "step": 3910, + "task_loss": 0.3750118017196655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.243570014834404, + "epoch": 3.31, + "learning_rate": 2.6385702209878033e-05, + "loss": 0.6818, + "step": 3911, + "task_loss": 0.06974969059228897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.707797646522522, + "epoch": 3.31, + "learning_rate": 2.6379664291752204e-05, + "loss": 0.8053, + "step": 3912, + "task_loss": 0.5301773548126221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5330283641815186, + "epoch": 3.31, + "learning_rate": 2.6373626373626374e-05, + "loss": 0.6056, + "step": 3913, + "task_loss": 0.7068347334861755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3029429316520691, + "epoch": 3.31, + "learning_rate": 2.6367588455500548e-05, + "loss": 0.5441, + "step": 3914, + "task_loss": 0.08207356184720993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5069031715393066, + "epoch": 3.31, + "learning_rate": 2.6361550537374712e-05, + "loss": 0.4646, + "step": 3915, + "task_loss": 0.44526171684265137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6169978380203247, + "epoch": 3.31, + "learning_rate": 2.6355512619248886e-05, + "loss": 0.5319, + "step": 3916, + "task_loss": 0.8421952724456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47404229640960693, + "epoch": 3.31, + "learning_rate": 2.6349474701123056e-05, + "loss": 0.8043, + "step": 3917, + "task_loss": 0.4813024401664734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4884621202945709, + "epoch": 3.31, + "learning_rate": 2.634343678299722e-05, + "loss": 0.6651, + "step": 3918, + "task_loss": 0.46151483058929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3522825241088867, + "epoch": 3.31, + "learning_rate": 2.6337398864871394e-05, + "loss": 0.5168, + "step": 3919, + "task_loss": 1.1924980878829956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5214714407920837, + "epoch": 3.31, + "learning_rate": 2.6331360946745565e-05, + "loss": 0.5938, + "step": 3920, + "task_loss": 0.6074795126914978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9342019557952881, + "epoch": 3.31, + "learning_rate": 2.6325323028619732e-05, + "loss": 0.8019, + "step": 3921, + "task_loss": 0.19990885257720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5792969465255737, + "epoch": 3.32, + "learning_rate": 2.6319285110493903e-05, + "loss": 0.6307, + "step": 3922, + "task_loss": 1.1586164236068726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5641287565231323, + "epoch": 3.32, + "learning_rate": 2.6313247192368073e-05, + "loss": 0.6074, + "step": 3923, + "task_loss": 1.4743331670761108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.674136221408844, + "epoch": 3.32, + "learning_rate": 2.6307209274242244e-05, + "loss": 0.6302, + "step": 3924, + "task_loss": 1.9525656700134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33975762128829956, + "epoch": 3.32, + "learning_rate": 2.630117135611641e-05, + "loss": 0.6684, + "step": 3925, + "task_loss": 0.7077212929725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5714633464813232, + "epoch": 3.32, + "learning_rate": 2.629513343799058e-05, + "loss": 0.6142, + "step": 3926, + "task_loss": 0.48205330967903137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4228416085243225, + "epoch": 3.32, + "learning_rate": 2.6289095519864755e-05, + "loss": 0.5963, + "step": 3927, + "task_loss": 0.2441185861825943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5860397815704346, + "epoch": 3.32, + "learning_rate": 2.628305760173892e-05, + "loss": 0.5328, + "step": 3928, + "task_loss": 0.4942067861557007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2570850849151611, + "epoch": 3.32, + "learning_rate": 2.627701968361309e-05, + "loss": 0.7698, + "step": 3929, + "task_loss": 0.8795751333236694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40958184003829956, + "epoch": 3.32, + "learning_rate": 2.6270981765487264e-05, + "loss": 0.5681, + "step": 3930, + "task_loss": 0.7087894082069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49124422669410706, + "epoch": 3.32, + "learning_rate": 2.6264943847361427e-05, + "loss": 0.5641, + "step": 3931, + "task_loss": 0.25018155574798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5732605457305908, + "epoch": 3.32, + "learning_rate": 2.62589059292356e-05, + "loss": 0.5875, + "step": 3932, + "task_loss": 0.1319923847913742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5042264461517334, + "epoch": 3.32, + "learning_rate": 2.6252868011109772e-05, + "loss": 0.6762, + "step": 3933, + "task_loss": 0.7675753831863403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7829902172088623, + "epoch": 3.33, + "learning_rate": 2.6246830092983943e-05, + "loss": 0.5766, + "step": 3934, + "task_loss": 0.9722535014152527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5832582712173462, + "epoch": 3.33, + "learning_rate": 2.624079217485811e-05, + "loss": 0.6383, + "step": 3935, + "task_loss": 0.5819710493087769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5074308514595032, + "epoch": 3.33, + "learning_rate": 2.623475425673228e-05, + "loss": 0.7085, + "step": 3936, + "task_loss": 1.235790491104126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47025662660598755, + "epoch": 3.33, + "learning_rate": 2.622871633860645e-05, + "loss": 0.547, + "step": 3937, + "task_loss": 0.3245030641555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5935146808624268, + "epoch": 3.33, + "learning_rate": 2.6222678420480618e-05, + "loss": 0.633, + "step": 3938, + "task_loss": 0.9082158207893372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5476855039596558, + "epoch": 3.33, + "learning_rate": 2.621664050235479e-05, + "loss": 0.3847, + "step": 3939, + "task_loss": 0.7572515606880188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6679243445396423, + "epoch": 3.33, + "learning_rate": 2.621060258422896e-05, + "loss": 0.6444, + "step": 3940, + "task_loss": 0.16115054488182068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36852210760116577, + "epoch": 3.33, + "learning_rate": 2.6204564666103126e-05, + "loss": 0.6391, + "step": 3941, + "task_loss": 0.834308385848999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8478296399116516, + "epoch": 3.33, + "learning_rate": 2.6198526747977297e-05, + "loss": 0.5716, + "step": 3942, + "task_loss": 1.5304219722747803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4448738992214203, + "epoch": 3.33, + "learning_rate": 2.6192488829851468e-05, + "loss": 0.519, + "step": 3943, + "task_loss": 0.16034835577011108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5501660108566284, + "epoch": 3.33, + "learning_rate": 2.618645091172564e-05, + "loss": 0.5141, + "step": 3944, + "task_loss": 1.425825595855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2517452538013458, + "epoch": 3.33, + "learning_rate": 2.6180412993599805e-05, + "loss": 0.578, + "step": 3945, + "task_loss": 0.689324676990509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7069664001464844, + "epoch": 3.34, + "learning_rate": 2.617437507547398e-05, + "loss": 0.624, + "step": 3946, + "task_loss": 0.7517654299736023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43849098682403564, + "epoch": 3.34, + "learning_rate": 2.616833715734815e-05, + "loss": 0.5039, + "step": 3947, + "task_loss": 0.15890027582645416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34070831537246704, + "epoch": 3.34, + "learning_rate": 2.6162299239222317e-05, + "loss": 0.5948, + "step": 3948, + "task_loss": 0.4755551815032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5503568649291992, + "epoch": 3.34, + "learning_rate": 2.6156261321096488e-05, + "loss": 0.6308, + "step": 3949, + "task_loss": 0.5704407691955566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39978650212287903, + "epoch": 3.34, + "learning_rate": 2.6150223402970658e-05, + "loss": 0.7811, + "step": 3950, + "task_loss": 0.5388373732566833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2862473726272583, + "epoch": 3.34, + "learning_rate": 2.6144185484844825e-05, + "loss": 0.3045, + "step": 3951, + "task_loss": 0.8936640024185181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9583165049552917, + "epoch": 3.34, + "learning_rate": 2.6138147566718996e-05, + "loss": 0.6627, + "step": 3952, + "task_loss": 1.2962121963500977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5983402132987976, + "epoch": 3.34, + "learning_rate": 2.6132109648593166e-05, + "loss": 0.6773, + "step": 3953, + "task_loss": 0.20013566315174103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.671189546585083, + "epoch": 3.34, + "learning_rate": 2.6126071730467337e-05, + "loss": 0.578, + "step": 3954, + "task_loss": 0.44599011540412903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4950448274612427, + "epoch": 3.34, + "learning_rate": 2.6120033812341504e-05, + "loss": 0.7887, + "step": 3955, + "task_loss": 1.0941587686538696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5663962364196777, + "epoch": 3.34, + "learning_rate": 2.6113995894215675e-05, + "loss": 0.5225, + "step": 3956, + "task_loss": 0.7047349810600281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5218271017074585, + "epoch": 3.34, + "learning_rate": 2.610795797608985e-05, + "loss": 0.6904, + "step": 3957, + "task_loss": 0.5950703620910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6578766107559204, + "epoch": 3.35, + "learning_rate": 2.6101920057964013e-05, + "loss": 0.7838, + "step": 3958, + "task_loss": 0.9733827114105225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5907174944877625, + "epoch": 3.35, + "learning_rate": 2.6095882139838183e-05, + "loss": 0.7218, + "step": 3959, + "task_loss": 0.468867689371109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6422519087791443, + "epoch": 3.35, + "learning_rate": 2.6089844221712357e-05, + "loss": 0.5173, + "step": 3960, + "task_loss": 0.11491935700178146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5063284635543823, + "epoch": 3.35, + "learning_rate": 2.608380630358652e-05, + "loss": 0.644, + "step": 3961, + "task_loss": 0.7304311394691467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3663907051086426, + "epoch": 3.35, + "learning_rate": 2.6077768385460695e-05, + "loss": 0.4794, + "step": 3962, + "task_loss": 0.44228166341781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4637787938117981, + "epoch": 3.35, + "learning_rate": 2.6071730467334865e-05, + "loss": 0.6119, + "step": 3963, + "task_loss": 1.0656816959381104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5052222609519958, + "epoch": 3.35, + "learning_rate": 2.6065692549209036e-05, + "loss": 0.653, + "step": 3964, + "task_loss": 0.8747231364250183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7805315256118774, + "epoch": 3.35, + "learning_rate": 2.6059654631083203e-05, + "loss": 0.5161, + "step": 3965, + "task_loss": 0.6256901025772095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8468225002288818, + "epoch": 3.35, + "learning_rate": 2.6053616712957374e-05, + "loss": 0.5186, + "step": 3966, + "task_loss": 0.7385403513908386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8091645240783691, + "epoch": 3.35, + "learning_rate": 2.6047578794831544e-05, + "loss": 0.5935, + "step": 3967, + "task_loss": 2.0240392684936523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0832350254058838, + "epoch": 3.35, + "learning_rate": 2.604154087670571e-05, + "loss": 0.8204, + "step": 3968, + "task_loss": 1.5370038747787476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49045324325561523, + "epoch": 3.35, + "learning_rate": 2.6035502958579882e-05, + "loss": 0.5968, + "step": 3969, + "task_loss": 1.171879768371582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5738171935081482, + "epoch": 3.36, + "learning_rate": 2.6029465040454053e-05, + "loss": 0.6297, + "step": 3970, + "task_loss": 0.08775167167186737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8335238099098206, + "epoch": 3.36, + "learning_rate": 2.602342712232822e-05, + "loss": 0.7244, + "step": 3971, + "task_loss": 1.4658639430999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7119938135147095, + "epoch": 3.36, + "learning_rate": 2.601738920420239e-05, + "loss": 0.6659, + "step": 3972, + "task_loss": 0.8703680038452148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39360442757606506, + "epoch": 3.36, + "learning_rate": 2.6011351286076564e-05, + "loss": 0.7602, + "step": 3973, + "task_loss": 0.47015106678009033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6743742227554321, + "epoch": 3.36, + "learning_rate": 2.6005313367950735e-05, + "loss": 0.6371, + "step": 3974, + "task_loss": 1.0348647832870483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31033754348754883, + "epoch": 3.36, + "learning_rate": 2.59992754498249e-05, + "loss": 0.5168, + "step": 3975, + "task_loss": 0.9953373670578003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0974111557006836, + "epoch": 3.36, + "learning_rate": 2.5993237531699073e-05, + "loss": 0.7893, + "step": 3976, + "task_loss": 0.6379475593566895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4914453625679016, + "epoch": 3.36, + "learning_rate": 2.5987199613573243e-05, + "loss": 0.6733, + "step": 3977, + "task_loss": 0.6056598424911499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42430174350738525, + "epoch": 3.36, + "learning_rate": 2.598116169544741e-05, + "loss": 0.5241, + "step": 3978, + "task_loss": 0.5424736142158508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5069644451141357, + "epoch": 3.36, + "learning_rate": 2.597512377732158e-05, + "loss": 0.6371, + "step": 3979, + "task_loss": 0.6619279384613037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.78520667552948, + "epoch": 3.36, + "learning_rate": 2.596908585919575e-05, + "loss": 0.7728, + "step": 3980, + "task_loss": 0.43812429904937744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.316731333732605, + "epoch": 3.36, + "learning_rate": 2.596304794106992e-05, + "loss": 0.639, + "step": 3981, + "task_loss": 0.12956060469150543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6818212270736694, + "epoch": 3.37, + "learning_rate": 2.595701002294409e-05, + "loss": 0.5799, + "step": 3982, + "task_loss": 0.6076228022575378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.65285325050354, + "epoch": 3.37, + "learning_rate": 2.595097210481826e-05, + "loss": 0.6448, + "step": 3983, + "task_loss": 0.876469075679779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6837882995605469, + "epoch": 3.37, + "learning_rate": 2.5944934186692434e-05, + "loss": 0.6658, + "step": 3984, + "task_loss": 2.5778963565826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4796503186225891, + "epoch": 3.37, + "learning_rate": 2.5938896268566598e-05, + "loss": 0.6725, + "step": 3985, + "task_loss": 0.48694297671318054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46068698167800903, + "epoch": 3.37, + "learning_rate": 2.5932858350440768e-05, + "loss": 0.6342, + "step": 3986, + "task_loss": 1.5280755758285522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5136503577232361, + "epoch": 3.37, + "learning_rate": 2.5926820432314942e-05, + "loss": 0.623, + "step": 3987, + "task_loss": 0.7039515972137451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3499181270599365, + "epoch": 3.37, + "learning_rate": 2.5920782514189106e-05, + "loss": 0.5471, + "step": 3988, + "task_loss": 0.26938170194625854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7411595582962036, + "epoch": 3.37, + "learning_rate": 2.591474459606328e-05, + "loss": 0.6261, + "step": 3989, + "task_loss": 0.9704373478889465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6775782108306885, + "epoch": 3.37, + "learning_rate": 2.590870667793745e-05, + "loss": 0.6907, + "step": 3990, + "task_loss": 1.0495789051055908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8890604972839355, + "epoch": 3.37, + "learning_rate": 2.5902668759811614e-05, + "loss": 0.7579, + "step": 3991, + "task_loss": 0.6684366464614868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4833575487136841, + "epoch": 3.37, + "learning_rate": 2.5896630841685788e-05, + "loss": 0.5072, + "step": 3992, + "task_loss": 0.5276519060134888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8928574919700623, + "epoch": 3.38, + "learning_rate": 2.589059292355996e-05, + "loss": 0.7992, + "step": 3993, + "task_loss": 0.5278645753860474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1955161690711975, + "epoch": 3.38, + "learning_rate": 2.588455500543413e-05, + "loss": 0.5209, + "step": 3994, + "task_loss": 0.2663785219192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5638626217842102, + "epoch": 3.38, + "learning_rate": 2.5878517087308297e-05, + "loss": 0.5906, + "step": 3995, + "task_loss": 0.5938208699226379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5434603095054626, + "epoch": 3.38, + "learning_rate": 2.5872479169182467e-05, + "loss": 0.5963, + "step": 3996, + "task_loss": 0.47569167613983154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8213906288146973, + "epoch": 3.38, + "learning_rate": 2.5866441251056638e-05, + "loss": 0.7028, + "step": 3997, + "task_loss": 1.6206679344177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3728753924369812, + "epoch": 3.38, + "learning_rate": 2.5860403332930805e-05, + "loss": 0.5721, + "step": 3998, + "task_loss": 0.9220856428146362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9597940444946289, + "epoch": 3.38, + "learning_rate": 2.5854365414804975e-05, + "loss": 0.6385, + "step": 3999, + "task_loss": 0.7873920202255249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38649553060531616, + "epoch": 3.38, + "learning_rate": 2.584832749667915e-05, + "loss": 0.52, + "step": 4000, + "task_loss": 0.7521729469299316 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.8973861386138614, + "eval_loss": 0.39273032546043396, + "eval_runtime": 227.3471, + "eval_samples_per_second": 111.064, + "eval_steps_per_second": 0.871, + "step": 4000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5619375109672546, + "epoch": 3.38, + "learning_rate": 2.5842289578553313e-05, + "loss": 0.5681, + "step": 4001, + "task_loss": 0.8012285232543945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8093260526657104, + "epoch": 3.38, + "learning_rate": 2.5836251660427484e-05, + "loss": 0.6435, + "step": 4002, + "task_loss": 0.6069123148918152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.964133083820343, + "epoch": 3.38, + "learning_rate": 2.5830213742301658e-05, + "loss": 0.6817, + "step": 4003, + "task_loss": 1.0303255319595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4471619427204132, + "epoch": 3.38, + "learning_rate": 2.582417582417583e-05, + "loss": 0.4236, + "step": 4004, + "task_loss": 0.6342084407806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8036463260650635, + "epoch": 3.39, + "learning_rate": 2.5818137906049996e-05, + "loss": 0.6716, + "step": 4005, + "task_loss": 0.7874033451080322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5635591745376587, + "epoch": 3.39, + "learning_rate": 2.5812099987924166e-05, + "loss": 0.4473, + "step": 4006, + "task_loss": 0.7093924283981323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.556754231452942, + "epoch": 3.39, + "learning_rate": 2.5806062069798337e-05, + "loss": 0.7823, + "step": 4007, + "task_loss": 0.6224846839904785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6854926347732544, + "epoch": 3.39, + "learning_rate": 2.5800024151672504e-05, + "loss": 0.7555, + "step": 4008, + "task_loss": 0.69862961769104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1127930879592896, + "epoch": 3.39, + "learning_rate": 2.5793986233546674e-05, + "loss": 0.7784, + "step": 4009, + "task_loss": 0.9319111108779907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44248420000076294, + "epoch": 3.39, + "learning_rate": 2.5787948315420845e-05, + "loss": 0.5076, + "step": 4010, + "task_loss": 0.4517652988433838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6492220759391785, + "epoch": 3.39, + "learning_rate": 2.5781910397295012e-05, + "loss": 0.6031, + "step": 4011, + "task_loss": 0.556274950504303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40293532609939575, + "epoch": 3.39, + "learning_rate": 2.5775872479169183e-05, + "loss": 0.4453, + "step": 4012, + "task_loss": 0.9317710399627686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6116511225700378, + "epoch": 3.39, + "learning_rate": 2.5769834561043353e-05, + "loss": 0.6725, + "step": 4013, + "task_loss": 2.324150800704956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.479971319437027, + "epoch": 3.39, + "learning_rate": 2.5763796642917527e-05, + "loss": 0.6237, + "step": 4014, + "task_loss": 0.7184808254241943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5763856172561646, + "epoch": 3.39, + "learning_rate": 2.575775872479169e-05, + "loss": 0.6132, + "step": 4015, + "task_loss": 0.7311533093452454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6479064226150513, + "epoch": 3.39, + "learning_rate": 2.575172080666586e-05, + "loss": 0.5534, + "step": 4016, + "task_loss": 0.7794193625450134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4574090540409088, + "epoch": 3.4, + "learning_rate": 2.5745682888540036e-05, + "loss": 0.6907, + "step": 4017, + "task_loss": 0.17865455150604248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6432714462280273, + "epoch": 3.4, + "learning_rate": 2.57396449704142e-05, + "loss": 0.6805, + "step": 4018, + "task_loss": 0.649492084980011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.685693621635437, + "epoch": 3.4, + "learning_rate": 2.5733607052288373e-05, + "loss": 0.7439, + "step": 4019, + "task_loss": 1.2685801982879639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6344225406646729, + "epoch": 3.4, + "learning_rate": 2.5727569134162544e-05, + "loss": 0.5662, + "step": 4020, + "task_loss": 0.2664928734302521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6037537455558777, + "epoch": 3.4, + "learning_rate": 2.572153121603671e-05, + "loss": 0.6707, + "step": 4021, + "task_loss": 0.5969308614730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7958155870437622, + "epoch": 3.4, + "learning_rate": 2.5715493297910882e-05, + "loss": 0.8082, + "step": 4022, + "task_loss": 0.5687426328659058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45709228515625, + "epoch": 3.4, + "learning_rate": 2.5709455379785052e-05, + "loss": 0.5067, + "step": 4023, + "task_loss": 0.1992705762386322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9010422825813293, + "epoch": 3.4, + "learning_rate": 2.5703417461659223e-05, + "loss": 0.755, + "step": 4024, + "task_loss": 0.7553755640983582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3833635449409485, + "epoch": 3.4, + "learning_rate": 2.569737954353339e-05, + "loss": 0.7304, + "step": 4025, + "task_loss": 0.6125249862670898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5408085584640503, + "epoch": 3.4, + "learning_rate": 2.569134162540756e-05, + "loss": 0.6574, + "step": 4026, + "task_loss": 0.7149351239204407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8328060507774353, + "epoch": 3.4, + "learning_rate": 2.568530370728173e-05, + "loss": 0.8334, + "step": 4027, + "task_loss": 1.4344416856765747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4606068730354309, + "epoch": 3.4, + "learning_rate": 2.56792657891559e-05, + "loss": 0.6398, + "step": 4028, + "task_loss": 1.458067774772644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6227245330810547, + "epoch": 3.41, + "learning_rate": 2.567322787103007e-05, + "loss": 0.5176, + "step": 4029, + "task_loss": 0.8357695937156677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43120628595352173, + "epoch": 3.41, + "learning_rate": 2.5667189952904243e-05, + "loss": 0.6126, + "step": 4030, + "task_loss": 0.48504894971847534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0483330488204956, + "epoch": 3.41, + "learning_rate": 2.5661152034778407e-05, + "loss": 0.792, + "step": 4031, + "task_loss": 0.6739441156387329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8086826801300049, + "epoch": 3.41, + "learning_rate": 2.5655114116652577e-05, + "loss": 0.7237, + "step": 4032, + "task_loss": 0.9790715575218201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6444809436798096, + "epoch": 3.41, + "learning_rate": 2.564907619852675e-05, + "loss": 0.6289, + "step": 4033, + "task_loss": 0.388263076543808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42495161294937134, + "epoch": 3.41, + "learning_rate": 2.5643038280400922e-05, + "loss": 0.6366, + "step": 4034, + "task_loss": 1.5956201553344727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8781970739364624, + "epoch": 3.41, + "learning_rate": 2.563700036227509e-05, + "loss": 0.6764, + "step": 4035, + "task_loss": 1.2105929851531982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8940562605857849, + "epoch": 3.41, + "learning_rate": 2.563096244414926e-05, + "loss": 0.6472, + "step": 4036, + "task_loss": 1.3130667209625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7000995874404907, + "epoch": 3.41, + "learning_rate": 2.562492452602343e-05, + "loss": 0.6387, + "step": 4037, + "task_loss": 0.516766369342804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0856273174285889, + "epoch": 3.41, + "learning_rate": 2.5618886607897597e-05, + "loss": 0.7653, + "step": 4038, + "task_loss": 1.5353285074234009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45068103075027466, + "epoch": 3.41, + "learning_rate": 2.5612848689771768e-05, + "loss": 0.6586, + "step": 4039, + "task_loss": 1.0589542388916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3232635259628296, + "epoch": 3.41, + "learning_rate": 2.560681077164594e-05, + "loss": 0.4381, + "step": 4040, + "task_loss": 0.5172564387321472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9166985750198364, + "epoch": 3.42, + "learning_rate": 2.5600772853520106e-05, + "loss": 0.6915, + "step": 4041, + "task_loss": 1.1762224435806274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35645991563796997, + "epoch": 3.42, + "learning_rate": 2.5594734935394276e-05, + "loss": 0.6033, + "step": 4042, + "task_loss": 0.3474424481391907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3267214894294739, + "epoch": 3.42, + "learning_rate": 2.5588697017268447e-05, + "loss": 0.6968, + "step": 4043, + "task_loss": 0.8212542533874512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.777168869972229, + "epoch": 3.42, + "learning_rate": 2.558265909914262e-05, + "loss": 0.4834, + "step": 4044, + "task_loss": 0.22739778459072113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7606520652770996, + "epoch": 3.42, + "learning_rate": 2.5576621181016785e-05, + "loss": 0.5895, + "step": 4045, + "task_loss": 0.24477140605449677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6149568557739258, + "epoch": 3.42, + "learning_rate": 2.557058326289096e-05, + "loss": 0.5491, + "step": 4046, + "task_loss": 0.9849258065223694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3711242079734802, + "epoch": 3.42, + "learning_rate": 2.556454534476513e-05, + "loss": 0.5893, + "step": 4047, + "task_loss": 0.6251998543739319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3993990421295166, + "epoch": 3.42, + "learning_rate": 2.5558507426639293e-05, + "loss": 0.536, + "step": 4048, + "task_loss": 0.4073074460029602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5568403601646423, + "epoch": 3.42, + "learning_rate": 2.5552469508513467e-05, + "loss": 0.6903, + "step": 4049, + "task_loss": 0.34182626008987427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45057666301727295, + "epoch": 3.42, + "learning_rate": 2.5546431590387637e-05, + "loss": 0.7054, + "step": 4050, + "task_loss": 0.7158098220825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.627739429473877, + "epoch": 3.42, + "learning_rate": 2.5540393672261805e-05, + "loss": 0.6728, + "step": 4051, + "task_loss": 1.679520845413208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44601646065711975, + "epoch": 3.42, + "learning_rate": 2.5534355754135975e-05, + "loss": 0.8032, + "step": 4052, + "task_loss": 0.4958054721355438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6786448359489441, + "epoch": 3.43, + "learning_rate": 2.5528317836010146e-05, + "loss": 0.5551, + "step": 4053, + "task_loss": 0.49955958127975464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2151453495025635, + "epoch": 3.43, + "learning_rate": 2.5522279917884313e-05, + "loss": 0.7328, + "step": 4054, + "task_loss": 0.924723744392395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3481141924858093, + "epoch": 3.43, + "learning_rate": 2.5516241999758483e-05, + "loss": 0.5074, + "step": 4055, + "task_loss": 0.6574508547782898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41558998823165894, + "epoch": 3.43, + "learning_rate": 2.5510204081632654e-05, + "loss": 0.656, + "step": 4056, + "task_loss": 0.8131816983222961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4063583016395569, + "epoch": 3.43, + "learning_rate": 2.5504166163506828e-05, + "loss": 0.467, + "step": 4057, + "task_loss": 0.5216743350028992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7392359972000122, + "epoch": 3.43, + "learning_rate": 2.5498128245380992e-05, + "loss": 0.5464, + "step": 4058, + "task_loss": 0.5014329552650452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3512876331806183, + "epoch": 3.43, + "learning_rate": 2.5492090327255162e-05, + "loss": 0.4719, + "step": 4059, + "task_loss": 0.825005829334259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48547133803367615, + "epoch": 3.43, + "learning_rate": 2.5486052409129336e-05, + "loss": 0.5147, + "step": 4060, + "task_loss": 0.4620617926120758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3409923017024994, + "epoch": 3.43, + "learning_rate": 2.54800144910035e-05, + "loss": 0.5508, + "step": 4061, + "task_loss": 0.12199026346206665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4654749929904938, + "epoch": 3.43, + "learning_rate": 2.5473976572877674e-05, + "loss": 0.6074, + "step": 4062, + "task_loss": 0.5657976269721985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2463016510009766, + "epoch": 3.43, + "learning_rate": 2.5467938654751845e-05, + "loss": 0.7483, + "step": 4063, + "task_loss": 0.7868435978889465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4532428979873657, + "epoch": 3.44, + "learning_rate": 2.546190073662601e-05, + "loss": 0.6948, + "step": 4064, + "task_loss": 0.5779270529747009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6377803087234497, + "epoch": 3.44, + "learning_rate": 2.5455862818500182e-05, + "loss": 0.5841, + "step": 4065, + "task_loss": 0.5476099252700806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6959567666053772, + "epoch": 3.44, + "learning_rate": 2.5449824900374353e-05, + "loss": 0.6357, + "step": 4066, + "task_loss": 1.1215462684631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8884621262550354, + "epoch": 3.44, + "learning_rate": 2.5443786982248524e-05, + "loss": 0.6233, + "step": 4067, + "task_loss": 1.4239797592163086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3187132477760315, + "epoch": 3.44, + "learning_rate": 2.543774906412269e-05, + "loss": 0.7261, + "step": 4068, + "task_loss": 0.48107361793518066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.587197482585907, + "epoch": 3.44, + "learning_rate": 2.543171114599686e-05, + "loss": 0.6108, + "step": 4069, + "task_loss": 0.16340531408786774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38556361198425293, + "epoch": 3.44, + "learning_rate": 2.5425673227871032e-05, + "loss": 0.5954, + "step": 4070, + "task_loss": 0.8683832883834839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7441391944885254, + "epoch": 3.44, + "learning_rate": 2.54196353097452e-05, + "loss": 0.6779, + "step": 4071, + "task_loss": 1.5125949382781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5571364164352417, + "epoch": 3.44, + "learning_rate": 2.541359739161937e-05, + "loss": 0.5353, + "step": 4072, + "task_loss": 1.4887259006500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5537574291229248, + "epoch": 3.44, + "learning_rate": 2.540755947349354e-05, + "loss": 0.528, + "step": 4073, + "task_loss": 0.7559567093849182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4392712414264679, + "epoch": 3.44, + "learning_rate": 2.5401521555367707e-05, + "loss": 0.4894, + "step": 4074, + "task_loss": 0.8013269305229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7243004441261292, + "epoch": 3.44, + "learning_rate": 2.5395483637241878e-05, + "loss": 0.6391, + "step": 4075, + "task_loss": 1.3908051252365112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32406771183013916, + "epoch": 3.45, + "learning_rate": 2.5389445719116052e-05, + "loss": 0.3039, + "step": 4076, + "task_loss": 0.14536717534065247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5103085041046143, + "epoch": 3.45, + "learning_rate": 2.5383407800990222e-05, + "loss": 0.4905, + "step": 4077, + "task_loss": 0.9136930704116821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37917008996009827, + "epoch": 3.45, + "learning_rate": 2.537736988286439e-05, + "loss": 0.498, + "step": 4078, + "task_loss": 0.68227219581604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8792146444320679, + "epoch": 3.45, + "learning_rate": 2.537133196473856e-05, + "loss": 0.6489, + "step": 4079, + "task_loss": 0.9473459720611572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.779862642288208, + "epoch": 3.45, + "learning_rate": 2.536529404661273e-05, + "loss": 0.7844, + "step": 4080, + "task_loss": 0.37694743275642395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4561316967010498, + "epoch": 3.45, + "learning_rate": 2.5359256128486898e-05, + "loss": 0.6186, + "step": 4081, + "task_loss": 1.2141536474227905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7657855749130249, + "epoch": 3.45, + "learning_rate": 2.535321821036107e-05, + "loss": 0.8638, + "step": 4082, + "task_loss": 1.3844389915466309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24431322515010834, + "epoch": 3.45, + "learning_rate": 2.534718029223524e-05, + "loss": 0.5785, + "step": 4083, + "task_loss": 0.25010982155799866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49407440423965454, + "epoch": 3.45, + "learning_rate": 2.5341142374109406e-05, + "loss": 0.7149, + "step": 4084, + "task_loss": 0.5499097108840942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5480067729949951, + "epoch": 3.45, + "learning_rate": 2.5335104455983577e-05, + "loss": 0.4441, + "step": 4085, + "task_loss": 0.8086363077163696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5183594822883606, + "epoch": 3.45, + "learning_rate": 2.5329066537857747e-05, + "loss": 0.6751, + "step": 4086, + "task_loss": 0.7493042945861816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6638196110725403, + "epoch": 3.45, + "learning_rate": 2.532302861973192e-05, + "loss": 0.5771, + "step": 4087, + "task_loss": 0.6907023787498474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8202665448188782, + "epoch": 3.46, + "learning_rate": 2.5316990701606085e-05, + "loss": 0.8535, + "step": 4088, + "task_loss": 0.6950072646141052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5491451025009155, + "epoch": 3.46, + "learning_rate": 2.5310952783480256e-05, + "loss": 0.4876, + "step": 4089, + "task_loss": 0.7841203212738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5857775211334229, + "epoch": 3.46, + "learning_rate": 2.530491486535443e-05, + "loss": 0.6818, + "step": 4090, + "task_loss": 1.331052541732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6425918340682983, + "epoch": 3.46, + "learning_rate": 2.5298876947228594e-05, + "loss": 0.6916, + "step": 4091, + "task_loss": 0.4713674783706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.549384593963623, + "epoch": 3.46, + "learning_rate": 2.5292839029102767e-05, + "loss": 0.7336, + "step": 4092, + "task_loss": 0.6088681221008301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39014825224876404, + "epoch": 3.46, + "learning_rate": 2.5286801110976938e-05, + "loss": 0.5453, + "step": 4093, + "task_loss": 0.31069236993789673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2622162401676178, + "epoch": 3.46, + "learning_rate": 2.5280763192851102e-05, + "loss": 0.558, + "step": 4094, + "task_loss": 0.29418930411338806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6795892715454102, + "epoch": 3.46, + "learning_rate": 2.5274725274725276e-05, + "loss": 0.6031, + "step": 4095, + "task_loss": 1.8265482187271118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23804152011871338, + "epoch": 3.46, + "learning_rate": 2.5268687356599446e-05, + "loss": 0.5664, + "step": 4096, + "task_loss": 0.10905683040618896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36927133798599243, + "epoch": 3.46, + "learning_rate": 2.5262649438473617e-05, + "loss": 0.5172, + "step": 4097, + "task_loss": 0.29389524459838867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3518790006637573, + "epoch": 3.46, + "learning_rate": 2.5256611520347784e-05, + "loss": 0.4797, + "step": 4098, + "task_loss": 0.4755506217479706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9117350578308105, + "epoch": 3.46, + "learning_rate": 2.5250573602221955e-05, + "loss": 0.7022, + "step": 4099, + "task_loss": 0.7956864833831787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4939345419406891, + "epoch": 3.47, + "learning_rate": 2.5244535684096125e-05, + "loss": 0.5168, + "step": 4100, + "task_loss": 0.23132286965847015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7668377161026001, + "epoch": 3.47, + "learning_rate": 2.5238497765970292e-05, + "loss": 0.7647, + "step": 4101, + "task_loss": 0.3080413341522217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6111080646514893, + "epoch": 3.47, + "learning_rate": 2.5232459847844463e-05, + "loss": 0.4855, + "step": 4102, + "task_loss": 0.6244467496871948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4321189820766449, + "epoch": 3.47, + "learning_rate": 2.5226421929718637e-05, + "loss": 0.6329, + "step": 4103, + "task_loss": 0.9587082862854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5987145900726318, + "epoch": 3.47, + "learning_rate": 2.52203840115928e-05, + "loss": 0.6725, + "step": 4104, + "task_loss": 0.7074791789054871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6524959802627563, + "epoch": 3.47, + "learning_rate": 2.521434609346697e-05, + "loss": 0.6426, + "step": 4105, + "task_loss": 0.8801628351211548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6409686207771301, + "epoch": 3.47, + "learning_rate": 2.5208308175341145e-05, + "loss": 0.5967, + "step": 4106, + "task_loss": 0.3904375731945038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40125250816345215, + "epoch": 3.47, + "learning_rate": 2.5202270257215316e-05, + "loss": 0.5681, + "step": 4107, + "task_loss": 0.8492122292518616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9124498963356018, + "epoch": 3.47, + "learning_rate": 2.5196232339089483e-05, + "loss": 0.6647, + "step": 4108, + "task_loss": 0.7564025521278381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8590701818466187, + "epoch": 3.47, + "learning_rate": 2.5190194420963654e-05, + "loss": 0.5719, + "step": 4109, + "task_loss": 0.5583292245864868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5360000133514404, + "epoch": 3.47, + "learning_rate": 2.5184156502837824e-05, + "loss": 0.5825, + "step": 4110, + "task_loss": 0.5857567191123962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.757627010345459, + "epoch": 3.47, + "learning_rate": 2.517811858471199e-05, + "loss": 0.6933, + "step": 4111, + "task_loss": 0.3940114378929138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7134948968887329, + "epoch": 3.48, + "learning_rate": 2.5172080666586162e-05, + "loss": 0.6093, + "step": 4112, + "task_loss": 0.429108589887619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44072428345680237, + "epoch": 3.48, + "learning_rate": 2.5166042748460333e-05, + "loss": 0.533, + "step": 4113, + "task_loss": 1.0380498170852661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.573554277420044, + "epoch": 3.48, + "learning_rate": 2.51600048303345e-05, + "loss": 0.5996, + "step": 4114, + "task_loss": 0.3197292983531952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5454357862472534, + "epoch": 3.48, + "learning_rate": 2.515396691220867e-05, + "loss": 0.625, + "step": 4115, + "task_loss": 0.222787007689476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1879358291625977, + "epoch": 3.48, + "learning_rate": 2.514792899408284e-05, + "loss": 0.6776, + "step": 4116, + "task_loss": 1.277449369430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4026101231575012, + "epoch": 3.48, + "learning_rate": 2.5141891075957015e-05, + "loss": 0.5022, + "step": 4117, + "task_loss": 0.4813377261161804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6002867221832275, + "epoch": 3.48, + "learning_rate": 2.513585315783118e-05, + "loss": 0.7186, + "step": 4118, + "task_loss": 0.7160503268241882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48108774423599243, + "epoch": 3.48, + "learning_rate": 2.5129815239705353e-05, + "loss": 0.4579, + "step": 4119, + "task_loss": 1.2436598539352417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2867410182952881, + "epoch": 3.48, + "learning_rate": 2.5123777321579523e-05, + "loss": 0.5922, + "step": 4120, + "task_loss": 0.8729248046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4832357168197632, + "epoch": 3.48, + "learning_rate": 2.5117739403453687e-05, + "loss": 0.9395, + "step": 4121, + "task_loss": 0.7131137251853943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7502032518386841, + "epoch": 3.48, + "learning_rate": 2.511170148532786e-05, + "loss": 0.6439, + "step": 4122, + "task_loss": 0.6653632521629333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4150236248970032, + "epoch": 3.48, + "learning_rate": 2.510566356720203e-05, + "loss": 0.5345, + "step": 4123, + "task_loss": 0.35604193806648254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6976333260536194, + "epoch": 3.49, + "learning_rate": 2.50996256490762e-05, + "loss": 0.5961, + "step": 4124, + "task_loss": 0.5215651988983154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8826763033866882, + "epoch": 3.49, + "learning_rate": 2.509358773095037e-05, + "loss": 0.657, + "step": 4125, + "task_loss": 1.1694620847702026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6594715118408203, + "epoch": 3.49, + "learning_rate": 2.508754981282454e-05, + "loss": 0.5551, + "step": 4126, + "task_loss": 1.1890171766281128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38642674684524536, + "epoch": 3.49, + "learning_rate": 2.508151189469871e-05, + "loss": 0.5521, + "step": 4127, + "task_loss": 0.1710108518600464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5006185173988342, + "epoch": 3.49, + "learning_rate": 2.5075473976572878e-05, + "loss": 0.5986, + "step": 4128, + "task_loss": 0.2357504516839981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1956998109817505, + "epoch": 3.49, + "learning_rate": 2.5069436058447048e-05, + "loss": 0.7964, + "step": 4129, + "task_loss": 0.7756554484367371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6721758246421814, + "epoch": 3.49, + "learning_rate": 2.506339814032122e-05, + "loss": 0.6348, + "step": 4130, + "task_loss": 0.3779347836971283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7650978565216064, + "epoch": 3.49, + "learning_rate": 2.5057360222195386e-05, + "loss": 0.5759, + "step": 4131, + "task_loss": 1.070095419883728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46239903569221497, + "epoch": 3.49, + "learning_rate": 2.5051322304069556e-05, + "loss": 0.4727, + "step": 4132, + "task_loss": 0.3767215311527252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.588857889175415, + "epoch": 3.49, + "learning_rate": 2.504528438594373e-05, + "loss": 0.6174, + "step": 4133, + "task_loss": 0.8602174520492554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7712404727935791, + "epoch": 3.49, + "learning_rate": 2.5039246467817894e-05, + "loss": 0.6819, + "step": 4134, + "task_loss": 0.4068267047405243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48693594336509705, + "epoch": 3.5, + "learning_rate": 2.5033208549692068e-05, + "loss": 0.7122, + "step": 4135, + "task_loss": 0.4525044858455658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4844130873680115, + "epoch": 3.5, + "learning_rate": 2.502717063156624e-05, + "loss": 0.4968, + "step": 4136, + "task_loss": 0.8312535285949707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6508995294570923, + "epoch": 3.5, + "learning_rate": 2.502113271344041e-05, + "loss": 0.6318, + "step": 4137, + "task_loss": 0.5318276882171631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3770734369754791, + "epoch": 3.5, + "learning_rate": 2.5015094795314576e-05, + "loss": 0.7708, + "step": 4138, + "task_loss": 0.24721471965312958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8009540438652039, + "epoch": 3.5, + "learning_rate": 2.5009056877188747e-05, + "loss": 0.6698, + "step": 4139, + "task_loss": 0.6334355473518372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4211912155151367, + "epoch": 3.5, + "learning_rate": 2.5003018959062918e-05, + "loss": 0.7383, + "step": 4140, + "task_loss": 0.3429170548915863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2077077627182007, + "epoch": 3.5, + "learning_rate": 2.4996981040937088e-05, + "loss": 0.6292, + "step": 4141, + "task_loss": 0.6622662544250488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.071517825126648, + "epoch": 3.5, + "learning_rate": 2.4990943122811255e-05, + "loss": 0.7037, + "step": 4142, + "task_loss": 1.365580677986145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6356531381607056, + "epoch": 3.5, + "learning_rate": 2.4984905204685426e-05, + "loss": 0.6026, + "step": 4143, + "task_loss": 0.2705966532230377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6606204509735107, + "epoch": 3.5, + "learning_rate": 2.4978867286559597e-05, + "loss": 0.5695, + "step": 4144, + "task_loss": 0.6722994446754456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6121552586555481, + "epoch": 3.5, + "learning_rate": 2.4972829368433764e-05, + "loss": 0.5181, + "step": 4145, + "task_loss": 0.7211894392967224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7234222292900085, + "epoch": 3.5, + "learning_rate": 2.4966791450307934e-05, + "loss": 0.558, + "step": 4146, + "task_loss": 0.9108874201774597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6865897178649902, + "epoch": 3.51, + "learning_rate": 2.4960753532182105e-05, + "loss": 0.7021, + "step": 4147, + "task_loss": 0.9909319877624512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6469772458076477, + "epoch": 3.51, + "learning_rate": 2.4954715614056272e-05, + "loss": 0.5438, + "step": 4148, + "task_loss": 0.45413684844970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6964613199234009, + "epoch": 3.51, + "learning_rate": 2.4948677695930446e-05, + "loss": 0.4652, + "step": 4149, + "task_loss": 0.23486928641796112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7621926069259644, + "epoch": 3.51, + "learning_rate": 2.4942639777804613e-05, + "loss": 0.6954, + "step": 4150, + "task_loss": 1.6399043798446655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3576836585998535, + "epoch": 3.51, + "learning_rate": 2.4936601859678784e-05, + "loss": 0.7718, + "step": 4151, + "task_loss": 0.20914240181446075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9007201194763184, + "epoch": 3.51, + "learning_rate": 2.4930563941552954e-05, + "loss": 0.7603, + "step": 4152, + "task_loss": 0.37844640016555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42565685510635376, + "epoch": 3.51, + "learning_rate": 2.492452602342712e-05, + "loss": 0.584, + "step": 4153, + "task_loss": 1.0903626680374146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.780028760433197, + "epoch": 3.51, + "learning_rate": 2.4918488105301292e-05, + "loss": 0.5745, + "step": 4154, + "task_loss": 0.5820525884628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6406984329223633, + "epoch": 3.51, + "learning_rate": 2.4912450187175463e-05, + "loss": 0.55, + "step": 4155, + "task_loss": 0.2671067416667938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47206252813339233, + "epoch": 3.51, + "learning_rate": 2.4906412269049633e-05, + "loss": 0.5297, + "step": 4156, + "task_loss": 0.6230443120002747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6872288584709167, + "epoch": 3.51, + "learning_rate": 2.4900374350923804e-05, + "loss": 0.5656, + "step": 4157, + "task_loss": 0.49943751096725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5443592071533203, + "epoch": 3.51, + "learning_rate": 2.489433643279797e-05, + "loss": 0.5494, + "step": 4158, + "task_loss": 1.2601468563079834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9456206560134888, + "epoch": 3.52, + "learning_rate": 2.488829851467214e-05, + "loss": 0.6375, + "step": 4159, + "task_loss": 0.7289506196975708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6927809119224548, + "epoch": 3.52, + "learning_rate": 2.4882260596546312e-05, + "loss": 0.6227, + "step": 4160, + "task_loss": 0.8540127277374268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6867104768753052, + "epoch": 3.52, + "learning_rate": 2.4876222678420483e-05, + "loss": 0.7358, + "step": 4161, + "task_loss": 1.230502963066101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6967648863792419, + "epoch": 3.52, + "learning_rate": 2.487018476029465e-05, + "loss": 0.5537, + "step": 4162, + "task_loss": 1.09322190284729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5458663702011108, + "epoch": 3.52, + "learning_rate": 2.486414684216882e-05, + "loss": 0.5668, + "step": 4163, + "task_loss": 0.8900253176689148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0240482091903687, + "epoch": 3.52, + "learning_rate": 2.485810892404299e-05, + "loss": 0.9407, + "step": 4164, + "task_loss": 0.8457967042922974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6908811926841736, + "epoch": 3.52, + "learning_rate": 2.485207100591716e-05, + "loss": 0.491, + "step": 4165, + "task_loss": 1.1205745935440063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48710277676582336, + "epoch": 3.52, + "learning_rate": 2.4846033087791332e-05, + "loss": 0.5711, + "step": 4166, + "task_loss": 0.5901609063148499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.693988025188446, + "epoch": 3.52, + "learning_rate": 2.48399951696655e-05, + "loss": 0.6861, + "step": 4167, + "task_loss": 1.334979772567749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34161728620529175, + "epoch": 3.52, + "learning_rate": 2.483395725153967e-05, + "loss": 0.6088, + "step": 4168, + "task_loss": 0.22691932320594788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3767860233783722, + "epoch": 3.52, + "learning_rate": 2.482791933341384e-05, + "loss": 0.3688, + "step": 4169, + "task_loss": 0.15169915556907654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7047252058982849, + "epoch": 3.52, + "learning_rate": 2.4821881415288008e-05, + "loss": 0.5086, + "step": 4170, + "task_loss": 0.9495813846588135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37209832668304443, + "epoch": 3.53, + "learning_rate": 2.481584349716218e-05, + "loss": 0.5325, + "step": 4171, + "task_loss": 0.23220722377300262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9835602045059204, + "epoch": 3.53, + "learning_rate": 2.480980557903635e-05, + "loss": 0.6086, + "step": 4172, + "task_loss": 1.6538671255111694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3925881087779999, + "epoch": 3.53, + "learning_rate": 2.480376766091052e-05, + "loss": 0.8047, + "step": 4173, + "task_loss": 1.5235484838485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7159826755523682, + "epoch": 3.53, + "learning_rate": 2.479772974278469e-05, + "loss": 0.6364, + "step": 4174, + "task_loss": 1.7120332717895508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4556894600391388, + "epoch": 3.53, + "learning_rate": 2.4791691824658857e-05, + "loss": 0.6396, + "step": 4175, + "task_loss": 0.4975559115409851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5380347371101379, + "epoch": 3.53, + "learning_rate": 2.478565390653303e-05, + "loss": 0.6157, + "step": 4176, + "task_loss": 0.6434682011604309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42392241954803467, + "epoch": 3.53, + "learning_rate": 2.4779615988407198e-05, + "loss": 0.6646, + "step": 4177, + "task_loss": 0.8844846487045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24888218939304352, + "epoch": 3.53, + "learning_rate": 2.4773578070281365e-05, + "loss": 0.4635, + "step": 4178, + "task_loss": 0.40036919713020325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6721847057342529, + "epoch": 3.53, + "learning_rate": 2.476754015215554e-05, + "loss": 0.6057, + "step": 4179, + "task_loss": 1.0908523797988892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7315338253974915, + "epoch": 3.53, + "learning_rate": 2.4761502234029707e-05, + "loss": 0.5388, + "step": 4180, + "task_loss": 0.617293119430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9069807529449463, + "epoch": 3.53, + "learning_rate": 2.4755464315903877e-05, + "loss": 0.6865, + "step": 4181, + "task_loss": 1.3291676044464111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3983303904533386, + "epoch": 3.53, + "learning_rate": 2.4749426397778048e-05, + "loss": 0.5438, + "step": 4182, + "task_loss": 1.0584784746170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5257368087768555, + "epoch": 3.54, + "learning_rate": 2.4743388479652215e-05, + "loss": 0.6955, + "step": 4183, + "task_loss": 0.10372164845466614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43408524990081787, + "epoch": 3.54, + "learning_rate": 2.473735056152639e-05, + "loss": 0.5451, + "step": 4184, + "task_loss": 0.48534634709358215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5947525501251221, + "epoch": 3.54, + "learning_rate": 2.4731312643400556e-05, + "loss": 0.5278, + "step": 4185, + "task_loss": 1.2702536582946777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.617225170135498, + "epoch": 3.54, + "learning_rate": 2.4725274725274727e-05, + "loss": 0.5633, + "step": 4186, + "task_loss": 0.623254656791687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0065912008285522, + "epoch": 3.54, + "learning_rate": 2.4719236807148897e-05, + "loss": 0.6114, + "step": 4187, + "task_loss": 0.6516858339309692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3438969850540161, + "epoch": 3.54, + "learning_rate": 2.4713198889023064e-05, + "loss": 0.5736, + "step": 4188, + "task_loss": 0.8995336890220642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34393566846847534, + "epoch": 3.54, + "learning_rate": 2.4707160970897235e-05, + "loss": 0.6471, + "step": 4189, + "task_loss": 1.2938134670257568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8052244186401367, + "epoch": 3.54, + "learning_rate": 2.4701123052771406e-05, + "loss": 0.5769, + "step": 4190, + "task_loss": 0.7245664596557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.513466477394104, + "epoch": 3.54, + "learning_rate": 2.4695085134645576e-05, + "loss": 0.5096, + "step": 4191, + "task_loss": 0.25636932253837585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5502128005027771, + "epoch": 3.54, + "learning_rate": 2.4689047216519747e-05, + "loss": 0.6867, + "step": 4192, + "task_loss": 1.2755722999572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5753607153892517, + "epoch": 3.54, + "learning_rate": 2.4683009298393914e-05, + "loss": 0.5693, + "step": 4193, + "task_loss": 1.0072088241577148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9075958132743835, + "epoch": 3.54, + "learning_rate": 2.4676971380268084e-05, + "loss": 0.6451, + "step": 4194, + "task_loss": 0.2446889579296112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6129429936408997, + "epoch": 3.55, + "learning_rate": 2.4670933462142255e-05, + "loss": 0.6777, + "step": 4195, + "task_loss": 1.3101462125778198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8324920535087585, + "epoch": 3.55, + "learning_rate": 2.4664895544016426e-05, + "loss": 0.4754, + "step": 4196, + "task_loss": 0.821843147277832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5216440558433533, + "epoch": 3.55, + "learning_rate": 2.4658857625890593e-05, + "loss": 0.4818, + "step": 4197, + "task_loss": 0.6869677305221558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3029025197029114, + "epoch": 3.55, + "learning_rate": 2.4652819707764763e-05, + "loss": 0.6883, + "step": 4198, + "task_loss": 0.9774749875068665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7073577642440796, + "epoch": 3.55, + "learning_rate": 2.4646781789638934e-05, + "loss": 0.608, + "step": 4199, + "task_loss": 1.3318475484848022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6072676181793213, + "epoch": 3.55, + "learning_rate": 2.4640743871513104e-05, + "loss": 0.5624, + "step": 4200, + "task_loss": 0.6497167348861694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7979310750961304, + "epoch": 3.55, + "learning_rate": 2.4634705953387275e-05, + "loss": 0.4511, + "step": 4201, + "task_loss": 0.6426718235015869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7035356760025024, + "epoch": 3.55, + "learning_rate": 2.4628668035261442e-05, + "loss": 0.5594, + "step": 4202, + "task_loss": 0.5779056549072266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4713970422744751, + "epoch": 3.55, + "learning_rate": 2.4622630117135613e-05, + "loss": 0.6405, + "step": 4203, + "task_loss": 0.3309163749217987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.3479150533676147, + "epoch": 3.55, + "learning_rate": 2.4616592199009783e-05, + "loss": 0.8725, + "step": 4204, + "task_loss": 0.9867119193077087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44606366753578186, + "epoch": 3.55, + "learning_rate": 2.461055428088395e-05, + "loss": 0.5515, + "step": 4205, + "task_loss": 0.6819097399711609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8505806922912598, + "epoch": 3.56, + "learning_rate": 2.4604516362758124e-05, + "loss": 0.6598, + "step": 4206, + "task_loss": 0.9419381618499756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7018644213676453, + "epoch": 3.56, + "learning_rate": 2.459847844463229e-05, + "loss": 0.7172, + "step": 4207, + "task_loss": 0.3468632400035858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7288522124290466, + "epoch": 3.56, + "learning_rate": 2.4592440526506462e-05, + "loss": 0.5848, + "step": 4208, + "task_loss": 0.7241808772087097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5328664183616638, + "epoch": 3.56, + "learning_rate": 2.4586402608380633e-05, + "loss": 0.5549, + "step": 4209, + "task_loss": 0.7984256148338318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3230014741420746, + "epoch": 3.56, + "learning_rate": 2.45803646902548e-05, + "loss": 0.5678, + "step": 4210, + "task_loss": 0.19243711233139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5487404465675354, + "epoch": 3.56, + "learning_rate": 2.457432677212897e-05, + "loss": 0.6123, + "step": 4211, + "task_loss": 0.850402295589447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6502159237861633, + "epoch": 3.56, + "learning_rate": 2.456828885400314e-05, + "loss": 0.5805, + "step": 4212, + "task_loss": 0.8755824565887451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5028156042098999, + "epoch": 3.56, + "learning_rate": 2.456225093587731e-05, + "loss": 0.5127, + "step": 4213, + "task_loss": 0.5019248723983765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.399415522813797, + "epoch": 3.56, + "learning_rate": 2.4556213017751482e-05, + "loss": 0.5287, + "step": 4214, + "task_loss": 0.9000704884529114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3839940130710602, + "epoch": 3.56, + "learning_rate": 2.455017509962565e-05, + "loss": 0.7481, + "step": 4215, + "task_loss": 0.6470539569854736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48718535900115967, + "epoch": 3.56, + "learning_rate": 2.454413718149982e-05, + "loss": 0.4779, + "step": 4216, + "task_loss": 0.6259360909461975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3314751386642456, + "epoch": 3.56, + "learning_rate": 2.453809926337399e-05, + "loss": 0.4968, + "step": 4217, + "task_loss": 0.7570702433586121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4707067012786865, + "epoch": 3.57, + "learning_rate": 2.4532061345248158e-05, + "loss": 0.642, + "step": 4218, + "task_loss": 0.5359216332435608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7616022825241089, + "epoch": 3.57, + "learning_rate": 2.452602342712233e-05, + "loss": 0.622, + "step": 4219, + "task_loss": 1.0238831043243408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4238653779029846, + "epoch": 3.57, + "learning_rate": 2.45199855089965e-05, + "loss": 0.7518, + "step": 4220, + "task_loss": 0.9566068053245544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23930177092552185, + "epoch": 3.57, + "learning_rate": 2.451394759087067e-05, + "loss": 0.586, + "step": 4221, + "task_loss": 0.08897644281387329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49163293838500977, + "epoch": 3.57, + "learning_rate": 2.450790967274484e-05, + "loss": 0.476, + "step": 4222, + "task_loss": 0.846335768699646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6995770931243896, + "epoch": 3.57, + "learning_rate": 2.4501871754619007e-05, + "loss": 0.4632, + "step": 4223, + "task_loss": 1.1655348539352417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1134285926818848, + "epoch": 3.57, + "learning_rate": 2.4495833836493178e-05, + "loss": 0.6538, + "step": 4224, + "task_loss": 1.3200987577438354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6507658958435059, + "epoch": 3.57, + "learning_rate": 2.448979591836735e-05, + "loss": 0.7354, + "step": 4225, + "task_loss": 1.245566725730896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45306989550590515, + "epoch": 3.57, + "learning_rate": 2.448375800024152e-05, + "loss": 0.5751, + "step": 4226, + "task_loss": 0.9382508397102356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2857702970504761, + "epoch": 3.57, + "learning_rate": 2.4477720082115686e-05, + "loss": 0.5196, + "step": 4227, + "task_loss": 0.46433836221694946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5763422250747681, + "epoch": 3.57, + "learning_rate": 2.4471682163989857e-05, + "loss": 0.5633, + "step": 4228, + "task_loss": 0.3643262982368469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6482219696044922, + "epoch": 3.57, + "learning_rate": 2.4465644245864027e-05, + "loss": 0.8671, + "step": 4229, + "task_loss": 1.3384655714035034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6682378053665161, + "epoch": 3.58, + "learning_rate": 2.4459606327738198e-05, + "loss": 0.5398, + "step": 4230, + "task_loss": 0.7895174622535706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4206162691116333, + "epoch": 3.58, + "learning_rate": 2.445356840961237e-05, + "loss": 0.5475, + "step": 4231, + "task_loss": 0.5694159865379333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4416455626487732, + "epoch": 3.58, + "learning_rate": 2.4447530491486536e-05, + "loss": 0.6186, + "step": 4232, + "task_loss": 0.8253732323646545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6457151770591736, + "epoch": 3.58, + "learning_rate": 2.4441492573360706e-05, + "loss": 0.523, + "step": 4233, + "task_loss": 0.9314072728157043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35384833812713623, + "epoch": 3.58, + "learning_rate": 2.4435454655234877e-05, + "loss": 0.4445, + "step": 4234, + "task_loss": 0.4616059958934784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8133848905563354, + "epoch": 3.58, + "learning_rate": 2.4429416737109044e-05, + "loss": 0.548, + "step": 4235, + "task_loss": 0.6348656415939331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3496147096157074, + "epoch": 3.58, + "learning_rate": 2.4423378818983218e-05, + "loss": 0.4827, + "step": 4236, + "task_loss": 0.0861474797129631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8831364512443542, + "epoch": 3.58, + "learning_rate": 2.4417340900857385e-05, + "loss": 0.7353, + "step": 4237, + "task_loss": 1.387492299079895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3587501049041748, + "epoch": 3.58, + "learning_rate": 2.4411302982731556e-05, + "loss": 0.6289, + "step": 4238, + "task_loss": 0.9814375638961792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5531660318374634, + "epoch": 3.58, + "learning_rate": 2.4405265064605726e-05, + "loss": 0.5689, + "step": 4239, + "task_loss": 0.43527480959892273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7911735773086548, + "epoch": 3.58, + "learning_rate": 2.4399227146479893e-05, + "loss": 0.6207, + "step": 4240, + "task_loss": 0.9894632697105408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49902456998825073, + "epoch": 3.58, + "learning_rate": 2.4393189228354067e-05, + "loss": 0.65, + "step": 4241, + "task_loss": 0.7161417603492737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5667104721069336, + "epoch": 3.59, + "learning_rate": 2.4387151310228235e-05, + "loss": 0.7315, + "step": 4242, + "task_loss": 1.3169701099395752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7981175184249878, + "epoch": 3.59, + "learning_rate": 2.4381113392102402e-05, + "loss": 0.6645, + "step": 4243, + "task_loss": 0.4979659616947174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5441055297851562, + "epoch": 3.59, + "learning_rate": 2.4375075473976576e-05, + "loss": 0.513, + "step": 4244, + "task_loss": 0.9403365254402161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31866455078125, + "epoch": 3.59, + "learning_rate": 2.4369037555850743e-05, + "loss": 0.5917, + "step": 4245, + "task_loss": 0.17901165783405304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.683908998966217, + "epoch": 3.59, + "learning_rate": 2.4362999637724913e-05, + "loss": 0.5988, + "step": 4246, + "task_loss": 0.3908775746822357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46619880199432373, + "epoch": 3.59, + "learning_rate": 2.4356961719599084e-05, + "loss": 0.5163, + "step": 4247, + "task_loss": 0.901326060295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4998355209827423, + "epoch": 3.59, + "learning_rate": 2.435092380147325e-05, + "loss": 0.5721, + "step": 4248, + "task_loss": 0.29014265537261963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6405777931213379, + "epoch": 3.59, + "learning_rate": 2.4344885883347425e-05, + "loss": 0.6047, + "step": 4249, + "task_loss": 1.243961215019226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4013212323188782, + "epoch": 3.59, + "learning_rate": 2.4338847965221592e-05, + "loss": 0.4176, + "step": 4250, + "task_loss": 0.6892869472503662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7979017496109009, + "epoch": 3.59, + "learning_rate": 2.4332810047095763e-05, + "loss": 0.8838, + "step": 4251, + "task_loss": 0.487253338098526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5379495620727539, + "epoch": 3.59, + "learning_rate": 2.4326772128969933e-05, + "loss": 0.5005, + "step": 4252, + "task_loss": 0.7251278162002563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0675206184387207, + "epoch": 3.59, + "learning_rate": 2.43207342108441e-05, + "loss": 0.7608, + "step": 4253, + "task_loss": 2.178403377532959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3482409119606018, + "epoch": 3.6, + "learning_rate": 2.431469629271827e-05, + "loss": 0.6384, + "step": 4254, + "task_loss": 0.7409291863441467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.393201619386673, + "epoch": 3.6, + "learning_rate": 2.4308658374592442e-05, + "loss": 0.5204, + "step": 4255, + "task_loss": 0.5000784397125244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8834972381591797, + "epoch": 3.6, + "learning_rate": 2.4302620456466612e-05, + "loss": 0.6308, + "step": 4256, + "task_loss": 1.3259788751602173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0330417156219482, + "epoch": 3.6, + "learning_rate": 2.4296582538340783e-05, + "loss": 0.6635, + "step": 4257, + "task_loss": 1.0197449922561646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23399166762828827, + "epoch": 3.6, + "learning_rate": 2.429054462021495e-05, + "loss": 0.4484, + "step": 4258, + "task_loss": 0.23665161430835724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6208016872406006, + "epoch": 3.6, + "learning_rate": 2.428450670208912e-05, + "loss": 0.669, + "step": 4259, + "task_loss": 0.9160673022270203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8655545711517334, + "epoch": 3.6, + "learning_rate": 2.427846878396329e-05, + "loss": 0.6719, + "step": 4260, + "task_loss": 1.0783336162567139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37337255477905273, + "epoch": 3.6, + "learning_rate": 2.4272430865837462e-05, + "loss": 0.4706, + "step": 4261, + "task_loss": 0.6342410445213318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48439961671829224, + "epoch": 3.6, + "learning_rate": 2.426639294771163e-05, + "loss": 0.7941, + "step": 4262, + "task_loss": 0.46786925196647644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5604270100593567, + "epoch": 3.6, + "learning_rate": 2.42603550295858e-05, + "loss": 0.6856, + "step": 4263, + "task_loss": 0.7858852744102478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26819878816604614, + "epoch": 3.6, + "learning_rate": 2.425431711145997e-05, + "loss": 0.413, + "step": 4264, + "task_loss": 0.07641857117414474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27646902203559875, + "epoch": 3.6, + "learning_rate": 2.424827919333414e-05, + "loss": 0.4224, + "step": 4265, + "task_loss": 0.34880775213241577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4637950360774994, + "epoch": 3.61, + "learning_rate": 2.424224127520831e-05, + "loss": 0.4939, + "step": 4266, + "task_loss": 0.3946613073348999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6749200820922852, + "epoch": 3.61, + "learning_rate": 2.423620335708248e-05, + "loss": 0.6107, + "step": 4267, + "task_loss": 1.0687825679779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6110369563102722, + "epoch": 3.61, + "learning_rate": 2.423016543895665e-05, + "loss": 0.6363, + "step": 4268, + "task_loss": 1.0181410312652588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36688047647476196, + "epoch": 3.61, + "learning_rate": 2.422412752083082e-05, + "loss": 0.6073, + "step": 4269, + "task_loss": 0.29250049591064453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45256829261779785, + "epoch": 3.61, + "learning_rate": 2.4218089602704987e-05, + "loss": 0.4911, + "step": 4270, + "task_loss": 0.19085165858268738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6192053556442261, + "epoch": 3.61, + "learning_rate": 2.421205168457916e-05, + "loss": 0.6192, + "step": 4271, + "task_loss": 1.6141718626022339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5643174648284912, + "epoch": 3.61, + "learning_rate": 2.4206013766453328e-05, + "loss": 0.7307, + "step": 4272, + "task_loss": 1.378890872001648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4509198069572449, + "epoch": 3.61, + "learning_rate": 2.41999758483275e-05, + "loss": 0.5689, + "step": 4273, + "task_loss": 0.8678677678108215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36257609724998474, + "epoch": 3.61, + "learning_rate": 2.419393793020167e-05, + "loss": 0.6665, + "step": 4274, + "task_loss": 0.8630626797676086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44464147090911865, + "epoch": 3.61, + "learning_rate": 2.4187900012075836e-05, + "loss": 0.5083, + "step": 4275, + "task_loss": 0.750762939453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9607479572296143, + "epoch": 3.61, + "learning_rate": 2.4181862093950007e-05, + "loss": 0.6782, + "step": 4276, + "task_loss": 0.6660279035568237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6897210478782654, + "epoch": 3.61, + "learning_rate": 2.4175824175824177e-05, + "loss": 0.5142, + "step": 4277, + "task_loss": 0.6546536684036255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5398198366165161, + "epoch": 3.62, + "learning_rate": 2.4169786257698345e-05, + "loss": 0.4464, + "step": 4278, + "task_loss": 0.35818231105804443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48610928654670715, + "epoch": 3.62, + "learning_rate": 2.416374833957252e-05, + "loss": 0.5843, + "step": 4279, + "task_loss": 0.5175728797912598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7794807553291321, + "epoch": 3.62, + "learning_rate": 2.4157710421446686e-05, + "loss": 0.5415, + "step": 4280, + "task_loss": 0.6003097891807556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9873936176300049, + "epoch": 3.62, + "learning_rate": 2.4151672503320856e-05, + "loss": 0.6864, + "step": 4281, + "task_loss": 1.0064197778701782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7737504839897156, + "epoch": 3.62, + "learning_rate": 2.4145634585195027e-05, + "loss": 0.7493, + "step": 4282, + "task_loss": 0.8891194462776184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25292620062828064, + "epoch": 3.62, + "learning_rate": 2.4139596667069194e-05, + "loss": 0.5422, + "step": 4283, + "task_loss": 0.37782716751098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8298013210296631, + "epoch": 3.62, + "learning_rate": 2.4133558748943365e-05, + "loss": 0.5606, + "step": 4284, + "task_loss": 1.0094630718231201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6931698322296143, + "epoch": 3.62, + "learning_rate": 2.4127520830817535e-05, + "loss": 0.6745, + "step": 4285, + "task_loss": 0.5029622912406921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.884263277053833, + "epoch": 3.62, + "learning_rate": 2.4121482912691706e-05, + "loss": 0.5401, + "step": 4286, + "task_loss": 0.5203794240951538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3827900290489197, + "epoch": 3.62, + "learning_rate": 2.4115444994565876e-05, + "loss": 0.491, + "step": 4287, + "task_loss": 0.26666778326034546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26403582096099854, + "epoch": 3.62, + "learning_rate": 2.4109407076440044e-05, + "loss": 0.4339, + "step": 4288, + "task_loss": 0.7801306247711182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6067962050437927, + "epoch": 3.63, + "learning_rate": 2.4103369158314214e-05, + "loss": 0.6043, + "step": 4289, + "task_loss": 0.8797191381454468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5694129467010498, + "epoch": 3.63, + "learning_rate": 2.4097331240188385e-05, + "loss": 0.49, + "step": 4290, + "task_loss": 0.9478297829627991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.300503671169281, + "epoch": 3.63, + "learning_rate": 2.4091293322062555e-05, + "loss": 0.8414, + "step": 4291, + "task_loss": 0.35614240169525146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8378791809082031, + "epoch": 3.63, + "learning_rate": 2.4085255403936722e-05, + "loss": 0.5339, + "step": 4292, + "task_loss": 0.8773441314697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6370061635971069, + "epoch": 3.63, + "learning_rate": 2.4079217485810893e-05, + "loss": 0.6115, + "step": 4293, + "task_loss": 1.521229863166809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7086033821105957, + "epoch": 3.63, + "learning_rate": 2.4073179567685064e-05, + "loss": 0.602, + "step": 4294, + "task_loss": 0.9678382873535156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43517589569091797, + "epoch": 3.63, + "learning_rate": 2.4067141649559234e-05, + "loss": 0.5901, + "step": 4295, + "task_loss": 0.6506100296974182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4611397981643677, + "epoch": 3.63, + "learning_rate": 2.4061103731433405e-05, + "loss": 0.4927, + "step": 4296, + "task_loss": 0.5989790558815002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3266776502132416, + "epoch": 3.63, + "learning_rate": 2.4055065813307572e-05, + "loss": 0.4543, + "step": 4297, + "task_loss": 0.46925950050354004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5356259346008301, + "epoch": 3.63, + "learning_rate": 2.4049027895181742e-05, + "loss": 0.6457, + "step": 4298, + "task_loss": 1.395267128944397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39364925026893616, + "epoch": 3.63, + "learning_rate": 2.4042989977055913e-05, + "loss": 0.4223, + "step": 4299, + "task_loss": 0.500012993812561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46568095684051514, + "epoch": 3.63, + "learning_rate": 2.403695205893008e-05, + "loss": 0.6649, + "step": 4300, + "task_loss": 1.2760881185531616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5615409016609192, + "epoch": 3.64, + "learning_rate": 2.4030914140804254e-05, + "loss": 0.5653, + "step": 4301, + "task_loss": 0.6852695941925049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5643090009689331, + "epoch": 3.64, + "learning_rate": 2.402487622267842e-05, + "loss": 0.6068, + "step": 4302, + "task_loss": 0.6581242084503174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5467841625213623, + "epoch": 3.64, + "learning_rate": 2.4018838304552592e-05, + "loss": 0.5659, + "step": 4303, + "task_loss": 1.1785695552825928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5230387449264526, + "epoch": 3.64, + "learning_rate": 2.4012800386426763e-05, + "loss": 0.6225, + "step": 4304, + "task_loss": 0.6625786423683167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7795270681381226, + "epoch": 3.64, + "learning_rate": 2.400676246830093e-05, + "loss": 0.7036, + "step": 4305, + "task_loss": 1.3561177253723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5388163328170776, + "epoch": 3.64, + "learning_rate": 2.4000724550175104e-05, + "loss": 0.5409, + "step": 4306, + "task_loss": 0.2527400851249695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6517503261566162, + "epoch": 3.64, + "learning_rate": 2.399468663204927e-05, + "loss": 0.4718, + "step": 4307, + "task_loss": 0.7020302414894104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3922607898712158, + "epoch": 3.64, + "learning_rate": 2.3988648713923438e-05, + "loss": 0.558, + "step": 4308, + "task_loss": 0.33792805671691895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2505198121070862, + "epoch": 3.64, + "learning_rate": 2.3982610795797612e-05, + "loss": 0.7499, + "step": 4309, + "task_loss": 1.4245668649673462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2577147483825684, + "epoch": 3.64, + "learning_rate": 2.397657287767178e-05, + "loss": 0.7844, + "step": 4310, + "task_loss": 1.540686845779419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9071505665779114, + "epoch": 3.64, + "learning_rate": 2.397053495954595e-05, + "loss": 0.7144, + "step": 4311, + "task_loss": 0.9646505117416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8620690107345581, + "epoch": 3.64, + "learning_rate": 2.396449704142012e-05, + "loss": 0.5949, + "step": 4312, + "task_loss": 0.9074426293373108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44265222549438477, + "epoch": 3.65, + "learning_rate": 2.3958459123294287e-05, + "loss": 0.6603, + "step": 4313, + "task_loss": 0.4282679557800293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5303137302398682, + "epoch": 3.65, + "learning_rate": 2.395242120516846e-05, + "loss": 0.6798, + "step": 4314, + "task_loss": 0.6319541931152344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7429466247558594, + "epoch": 3.65, + "learning_rate": 2.394638328704263e-05, + "loss": 0.532, + "step": 4315, + "task_loss": 1.08628249168396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5437277555465698, + "epoch": 3.65, + "learning_rate": 2.39403453689168e-05, + "loss": 0.6242, + "step": 4316, + "task_loss": 0.6026431918144226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5413227081298828, + "epoch": 3.65, + "learning_rate": 2.393430745079097e-05, + "loss": 0.5093, + "step": 4317, + "task_loss": 1.40569269657135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6359764933586121, + "epoch": 3.65, + "learning_rate": 2.3928269532665137e-05, + "loss": 0.5781, + "step": 4318, + "task_loss": 0.6995397210121155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3550418019294739, + "epoch": 3.65, + "learning_rate": 2.3922231614539308e-05, + "loss": 0.5399, + "step": 4319, + "task_loss": 0.40978264808654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.592278003692627, + "epoch": 3.65, + "learning_rate": 2.3916193696413478e-05, + "loss": 0.4934, + "step": 4320, + "task_loss": 1.0277175903320312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3520861566066742, + "epoch": 3.65, + "learning_rate": 2.3910155778287645e-05, + "loss": 0.5322, + "step": 4321, + "task_loss": 0.9977183938026428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4137224853038788, + "epoch": 3.65, + "learning_rate": 2.390411786016182e-05, + "loss": 0.6491, + "step": 4322, + "task_loss": 0.23087714612483978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.709358811378479, + "epoch": 3.65, + "learning_rate": 2.3898079942035986e-05, + "loss": 0.7269, + "step": 4323, + "task_loss": 1.2134037017822266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4022415578365326, + "epoch": 3.65, + "learning_rate": 2.3892042023910157e-05, + "loss": 0.5052, + "step": 4324, + "task_loss": 0.5777286291122437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24018356204032898, + "epoch": 3.66, + "learning_rate": 2.3886004105784328e-05, + "loss": 0.6199, + "step": 4325, + "task_loss": 0.042449068278074265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3976602554321289, + "epoch": 3.66, + "learning_rate": 2.3879966187658495e-05, + "loss": 0.5082, + "step": 4326, + "task_loss": 0.9932011961936951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42278140783309937, + "epoch": 3.66, + "learning_rate": 2.3873928269532665e-05, + "loss": 0.5369, + "step": 4327, + "task_loss": 0.45321857929229736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41785335540771484, + "epoch": 3.66, + "learning_rate": 2.3867890351406836e-05, + "loss": 0.5708, + "step": 4328, + "task_loss": 0.341041624546051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37487316131591797, + "epoch": 3.66, + "learning_rate": 2.3861852433281006e-05, + "loss": 0.4989, + "step": 4329, + "task_loss": 0.4843595027923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7759889960289001, + "epoch": 3.66, + "learning_rate": 2.3855814515155177e-05, + "loss": 0.6312, + "step": 4330, + "task_loss": 0.7479853630065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5521308183670044, + "epoch": 3.66, + "learning_rate": 2.3849776597029344e-05, + "loss": 0.5694, + "step": 4331, + "task_loss": 0.15640904009342194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3748498857021332, + "epoch": 3.66, + "learning_rate": 2.3843738678903515e-05, + "loss": 0.3973, + "step": 4332, + "task_loss": 0.5818940997123718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8559672832489014, + "epoch": 3.66, + "learning_rate": 2.3837700760777685e-05, + "loss": 0.6637, + "step": 4333, + "task_loss": 0.4340634047985077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5266093611717224, + "epoch": 3.66, + "learning_rate": 2.3831662842651856e-05, + "loss": 0.451, + "step": 4334, + "task_loss": 1.163358449935913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46888086199760437, + "epoch": 3.66, + "learning_rate": 2.3825624924526023e-05, + "loss": 0.6834, + "step": 4335, + "task_loss": 0.6721116900444031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3285461962223053, + "epoch": 3.66, + "learning_rate": 2.3819587006400194e-05, + "loss": 0.4003, + "step": 4336, + "task_loss": 0.1277686357498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33291923999786377, + "epoch": 3.67, + "learning_rate": 2.3813549088274364e-05, + "loss": 0.7039, + "step": 4337, + "task_loss": 0.4311257302761078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5239934921264648, + "epoch": 3.67, + "learning_rate": 2.3807511170148535e-05, + "loss": 0.6426, + "step": 4338, + "task_loss": 0.6429594159126282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5139381885528564, + "epoch": 3.67, + "learning_rate": 2.3801473252022705e-05, + "loss": 0.5274, + "step": 4339, + "task_loss": 0.24639888107776642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.789879322052002, + "epoch": 3.67, + "learning_rate": 2.3795435333896873e-05, + "loss": 0.6836, + "step": 4340, + "task_loss": 0.5346865057945251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4358922243118286, + "epoch": 3.67, + "learning_rate": 2.3789397415771043e-05, + "loss": 0.6389, + "step": 4341, + "task_loss": 0.3873177170753479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3415587544441223, + "epoch": 3.67, + "learning_rate": 2.3783359497645214e-05, + "loss": 0.6568, + "step": 4342, + "task_loss": 0.32899007201194763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.65337073802948, + "epoch": 3.67, + "learning_rate": 2.377732157951938e-05, + "loss": 0.6593, + "step": 4343, + "task_loss": 0.5241606831550598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41692328453063965, + "epoch": 3.67, + "learning_rate": 2.3771283661393555e-05, + "loss": 0.5744, + "step": 4344, + "task_loss": 1.3385401964187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38485807180404663, + "epoch": 3.67, + "learning_rate": 2.3765245743267722e-05, + "loss": 0.4901, + "step": 4345, + "task_loss": 0.2238738238811493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5712052583694458, + "epoch": 3.67, + "learning_rate": 2.375920782514189e-05, + "loss": 0.592, + "step": 4346, + "task_loss": 0.6720612049102783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44998031854629517, + "epoch": 3.67, + "learning_rate": 2.3753169907016063e-05, + "loss": 0.4704, + "step": 4347, + "task_loss": 0.42162176966667175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.570525050163269, + "epoch": 3.67, + "learning_rate": 2.374713198889023e-05, + "loss": 0.7058, + "step": 4348, + "task_loss": 0.3019949793815613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9714746475219727, + "epoch": 3.68, + "learning_rate": 2.37410940707644e-05, + "loss": 0.6358, + "step": 4349, + "task_loss": 1.2269765138626099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.665420651435852, + "epoch": 3.68, + "learning_rate": 2.373505615263857e-05, + "loss": 0.4446, + "step": 4350, + "task_loss": 0.7301895618438721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7116296291351318, + "epoch": 3.68, + "learning_rate": 2.372901823451274e-05, + "loss": 0.6218, + "step": 4351, + "task_loss": 0.6280571818351746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8663284182548523, + "epoch": 3.68, + "learning_rate": 2.3722980316386913e-05, + "loss": 0.7794, + "step": 4352, + "task_loss": 0.8003362417221069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9489040374755859, + "epoch": 3.68, + "learning_rate": 2.371694239826108e-05, + "loss": 0.8086, + "step": 4353, + "task_loss": 1.1657168865203857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0347764492034912, + "epoch": 3.68, + "learning_rate": 2.371090448013525e-05, + "loss": 0.72, + "step": 4354, + "task_loss": 1.3878023624420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6099624037742615, + "epoch": 3.68, + "learning_rate": 2.370486656200942e-05, + "loss": 0.725, + "step": 4355, + "task_loss": 1.264418363571167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5760116577148438, + "epoch": 3.68, + "learning_rate": 2.3698828643883588e-05, + "loss": 0.6533, + "step": 4356, + "task_loss": 1.23392915725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5859034061431885, + "epoch": 3.68, + "learning_rate": 2.369279072575776e-05, + "loss": 0.7091, + "step": 4357, + "task_loss": 0.6661136150360107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6494975090026855, + "epoch": 3.68, + "learning_rate": 2.368675280763193e-05, + "loss": 0.4674, + "step": 4358, + "task_loss": 0.6247580051422119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48702794313430786, + "epoch": 3.68, + "learning_rate": 2.36807148895061e-05, + "loss": 0.4968, + "step": 4359, + "task_loss": 0.785739004611969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7734709978103638, + "epoch": 3.69, + "learning_rate": 2.367467697138027e-05, + "loss": 0.5379, + "step": 4360, + "task_loss": 0.4439743161201477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3113316297531128, + "epoch": 3.69, + "learning_rate": 2.3668639053254438e-05, + "loss": 0.493, + "step": 4361, + "task_loss": 0.6056875586509705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4885111451148987, + "epoch": 3.69, + "learning_rate": 2.3662601135128608e-05, + "loss": 0.5453, + "step": 4362, + "task_loss": 1.021718978881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7119765281677246, + "epoch": 3.69, + "learning_rate": 2.365656321700278e-05, + "loss": 0.6081, + "step": 4363, + "task_loss": 0.9250222444534302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5379183292388916, + "epoch": 3.69, + "learning_rate": 2.365052529887695e-05, + "loss": 0.5123, + "step": 4364, + "task_loss": 0.29842841625213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7729349732398987, + "epoch": 3.69, + "learning_rate": 2.3644487380751117e-05, + "loss": 0.6453, + "step": 4365, + "task_loss": 0.8580528497695923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3468550443649292, + "epoch": 3.69, + "learning_rate": 2.3638449462625287e-05, + "loss": 0.6883, + "step": 4366, + "task_loss": 0.862120509147644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7004908323287964, + "epoch": 3.69, + "learning_rate": 2.3632411544499458e-05, + "loss": 0.7006, + "step": 4367, + "task_loss": 0.7497504949569702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5857583284378052, + "epoch": 3.69, + "learning_rate": 2.3626373626373628e-05, + "loss": 0.5364, + "step": 4368, + "task_loss": 1.0275447368621826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5849742889404297, + "epoch": 3.69, + "learning_rate": 2.36203357082478e-05, + "loss": 0.529, + "step": 4369, + "task_loss": 0.7624648809432983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4179826080799103, + "epoch": 3.69, + "learning_rate": 2.3614297790121966e-05, + "loss": 0.6487, + "step": 4370, + "task_loss": 0.5821127891540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6383635401725769, + "epoch": 3.69, + "learning_rate": 2.3608259871996137e-05, + "loss": 0.6324, + "step": 4371, + "task_loss": 0.12653309106826782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5964045524597168, + "epoch": 3.7, + "learning_rate": 2.3602221953870307e-05, + "loss": 0.4753, + "step": 4372, + "task_loss": 1.12154221534729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9079617261886597, + "epoch": 3.7, + "learning_rate": 2.3596184035744474e-05, + "loss": 0.6292, + "step": 4373, + "task_loss": 0.9361488223075867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7502180337905884, + "epoch": 3.7, + "learning_rate": 2.3590146117618648e-05, + "loss": 0.6016, + "step": 4374, + "task_loss": 0.49215951561927795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6395629644393921, + "epoch": 3.7, + "learning_rate": 2.3584108199492815e-05, + "loss": 0.6775, + "step": 4375, + "task_loss": 1.8227969408035278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3942953646183014, + "epoch": 3.7, + "learning_rate": 2.3578070281366986e-05, + "loss": 0.7455, + "step": 4376, + "task_loss": 0.6180973052978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6891099810600281, + "epoch": 3.7, + "learning_rate": 2.3572032363241157e-05, + "loss": 0.6206, + "step": 4377, + "task_loss": 1.15162193775177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6132393479347229, + "epoch": 3.7, + "learning_rate": 2.3565994445115324e-05, + "loss": 0.6412, + "step": 4378, + "task_loss": 1.0419695377349854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.607231080532074, + "epoch": 3.7, + "learning_rate": 2.3559956526989498e-05, + "loss": 0.6849, + "step": 4379, + "task_loss": 0.3432684540748596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6909276247024536, + "epoch": 3.7, + "learning_rate": 2.3553918608863665e-05, + "loss": 0.6988, + "step": 4380, + "task_loss": 0.897251546382904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37630608677864075, + "epoch": 3.7, + "learning_rate": 2.3547880690737832e-05, + "loss": 0.5215, + "step": 4381, + "task_loss": 0.1891927868127823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3235810399055481, + "epoch": 3.7, + "learning_rate": 2.3541842772612006e-05, + "loss": 0.6792, + "step": 4382, + "task_loss": 0.13544581830501556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8620448112487793, + "epoch": 3.7, + "learning_rate": 2.3535804854486173e-05, + "loss": 0.7341, + "step": 4383, + "task_loss": 0.4014430046081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7867333889007568, + "epoch": 3.71, + "learning_rate": 2.3529766936360344e-05, + "loss": 0.6159, + "step": 4384, + "task_loss": 0.3357470631599426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6868933439254761, + "epoch": 3.71, + "learning_rate": 2.3523729018234514e-05, + "loss": 0.6366, + "step": 4385, + "task_loss": 1.1039340496063232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8148773312568665, + "epoch": 3.71, + "learning_rate": 2.351769110010868e-05, + "loss": 0.5945, + "step": 4386, + "task_loss": 0.4654320180416107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44183096289634705, + "epoch": 3.71, + "learning_rate": 2.3511653181982856e-05, + "loss": 0.5496, + "step": 4387, + "task_loss": 0.8588679432868958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.666212260723114, + "epoch": 3.71, + "learning_rate": 2.3505615263857023e-05, + "loss": 0.5312, + "step": 4388, + "task_loss": 0.1847188025712967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9916696548461914, + "epoch": 3.71, + "learning_rate": 2.3499577345731193e-05, + "loss": 0.7238, + "step": 4389, + "task_loss": 1.6854559183120728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19853267073631287, + "epoch": 3.71, + "learning_rate": 2.3493539427605364e-05, + "loss": 0.4684, + "step": 4390, + "task_loss": 0.10029415041208267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3769868016242981, + "epoch": 3.71, + "learning_rate": 2.348750150947953e-05, + "loss": 0.6432, + "step": 4391, + "task_loss": 0.9148777723312378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4568653106689453, + "epoch": 3.71, + "learning_rate": 2.34814635913537e-05, + "loss": 0.6523, + "step": 4392, + "task_loss": 1.2455281019210815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45522409677505493, + "epoch": 3.71, + "learning_rate": 2.3475425673227872e-05, + "loss": 0.6375, + "step": 4393, + "task_loss": 0.24589979648590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6949620246887207, + "epoch": 3.71, + "learning_rate": 2.3469387755102043e-05, + "loss": 0.6419, + "step": 4394, + "task_loss": 1.5485948324203491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49712884426116943, + "epoch": 3.71, + "learning_rate": 2.3463349836976213e-05, + "loss": 0.6477, + "step": 4395, + "task_loss": 0.7489649057388306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3558977246284485, + "epoch": 3.72, + "learning_rate": 2.345731191885038e-05, + "loss": 0.5066, + "step": 4396, + "task_loss": 0.4252975583076477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.845852255821228, + "epoch": 3.72, + "learning_rate": 2.345127400072455e-05, + "loss": 0.6293, + "step": 4397, + "task_loss": 1.162513017654419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0114325284957886, + "epoch": 3.72, + "learning_rate": 2.344523608259872e-05, + "loss": 0.6334, + "step": 4398, + "task_loss": 0.7668378353118896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7894441485404968, + "epoch": 3.72, + "learning_rate": 2.3439198164472892e-05, + "loss": 0.6698, + "step": 4399, + "task_loss": 1.1614115238189697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5702123045921326, + "epoch": 3.72, + "learning_rate": 2.343316024634706e-05, + "loss": 0.5268, + "step": 4400, + "task_loss": 0.32320210337638855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7098827362060547, + "epoch": 3.72, + "learning_rate": 2.342712232822123e-05, + "loss": 0.6034, + "step": 4401, + "task_loss": 1.8859773874282837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3972153961658478, + "epoch": 3.72, + "learning_rate": 2.34210844100954e-05, + "loss": 0.6023, + "step": 4402, + "task_loss": 0.6027222871780396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3747714161872864, + "epoch": 3.72, + "learning_rate": 2.341504649196957e-05, + "loss": 0.4614, + "step": 4403, + "task_loss": 0.41247859597206116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4844101071357727, + "epoch": 3.72, + "learning_rate": 2.3409008573843742e-05, + "loss": 0.5485, + "step": 4404, + "task_loss": 0.36870938539505005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4122719466686249, + "epoch": 3.72, + "learning_rate": 2.340297065571791e-05, + "loss": 0.4344, + "step": 4405, + "task_loss": 0.09296884387731552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.606812596321106, + "epoch": 3.72, + "learning_rate": 2.339693273759208e-05, + "loss": 0.6583, + "step": 4406, + "task_loss": 0.9646396636962891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2747921645641327, + "epoch": 3.72, + "learning_rate": 2.339089481946625e-05, + "loss": 0.6408, + "step": 4407, + "task_loss": 0.2483992576599121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3963993787765503, + "epoch": 3.73, + "learning_rate": 2.3384856901340417e-05, + "loss": 0.644, + "step": 4408, + "task_loss": 0.4907647371292114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5062950253486633, + "epoch": 3.73, + "learning_rate": 2.337881898321459e-05, + "loss": 0.5322, + "step": 4409, + "task_loss": 0.9860397577285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.440014511346817, + "epoch": 3.73, + "learning_rate": 2.337278106508876e-05, + "loss": 0.6859, + "step": 4410, + "task_loss": 0.6240302920341492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44640499353408813, + "epoch": 3.73, + "learning_rate": 2.3366743146962926e-05, + "loss": 0.7425, + "step": 4411, + "task_loss": 0.6822888851165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6823066473007202, + "epoch": 3.73, + "learning_rate": 2.33607052288371e-05, + "loss": 0.5774, + "step": 4412, + "task_loss": 0.5784011483192444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6787376403808594, + "epoch": 3.73, + "learning_rate": 2.3354667310711267e-05, + "loss": 0.5861, + "step": 4413, + "task_loss": 1.2307441234588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5095716714859009, + "epoch": 3.73, + "learning_rate": 2.3348629392585437e-05, + "loss": 0.6124, + "step": 4414, + "task_loss": 1.086814522743225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0313618183135986, + "epoch": 3.73, + "learning_rate": 2.3342591474459608e-05, + "loss": 0.548, + "step": 4415, + "task_loss": 0.7202802896499634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1274170875549316, + "epoch": 3.73, + "learning_rate": 2.3336553556333775e-05, + "loss": 0.6686, + "step": 4416, + "task_loss": 0.7466042637825012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5515289306640625, + "epoch": 3.73, + "learning_rate": 2.333051563820795e-05, + "loss": 0.6705, + "step": 4417, + "task_loss": 1.4633358716964722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17373959720134735, + "epoch": 3.73, + "learning_rate": 2.3324477720082116e-05, + "loss": 0.6254, + "step": 4418, + "task_loss": 0.08294698596000671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39644840359687805, + "epoch": 3.73, + "learning_rate": 2.3318439801956287e-05, + "loss": 0.5264, + "step": 4419, + "task_loss": 0.48362573981285095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5280262231826782, + "epoch": 3.74, + "learning_rate": 2.3312401883830457e-05, + "loss": 0.5205, + "step": 4420, + "task_loss": 0.6926913857460022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6857432723045349, + "epoch": 3.74, + "learning_rate": 2.3306363965704624e-05, + "loss": 0.6407, + "step": 4421, + "task_loss": 0.9331052303314209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35209646821022034, + "epoch": 3.74, + "learning_rate": 2.3300326047578795e-05, + "loss": 0.5675, + "step": 4422, + "task_loss": 1.197452425956726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43328309059143066, + "epoch": 3.74, + "learning_rate": 2.3294288129452966e-05, + "loss": 0.5529, + "step": 4423, + "task_loss": 1.5100347995758057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6814221143722534, + "epoch": 3.74, + "learning_rate": 2.3288250211327136e-05, + "loss": 0.5762, + "step": 4424, + "task_loss": 1.0538145303726196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5875340700149536, + "epoch": 3.74, + "learning_rate": 2.3282212293201307e-05, + "loss": 0.5315, + "step": 4425, + "task_loss": 1.5315380096435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5955626964569092, + "epoch": 3.74, + "learning_rate": 2.3276174375075474e-05, + "loss": 0.6559, + "step": 4426, + "task_loss": 0.336516410112381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5665879249572754, + "epoch": 3.74, + "learning_rate": 2.3270136456949645e-05, + "loss": 0.4703, + "step": 4427, + "task_loss": 0.6884249448776245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5615999102592468, + "epoch": 3.74, + "learning_rate": 2.3264098538823815e-05, + "loss": 0.6901, + "step": 4428, + "task_loss": 0.6692479848861694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44379621744155884, + "epoch": 3.74, + "learning_rate": 2.3258060620697986e-05, + "loss": 0.4697, + "step": 4429, + "task_loss": 0.16171132028102875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7368588447570801, + "epoch": 3.74, + "learning_rate": 2.3252022702572153e-05, + "loss": 0.9206, + "step": 4430, + "task_loss": 1.284806489944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8409979939460754, + "epoch": 3.75, + "learning_rate": 2.3245984784446323e-05, + "loss": 0.6714, + "step": 4431, + "task_loss": 1.345111608505249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6538600325584412, + "epoch": 3.75, + "learning_rate": 2.3239946866320494e-05, + "loss": 0.5493, + "step": 4432, + "task_loss": 0.9229645133018494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7139259576797485, + "epoch": 3.75, + "learning_rate": 2.3233908948194665e-05, + "loss": 0.7461, + "step": 4433, + "task_loss": 0.712328314781189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6338579654693604, + "epoch": 3.75, + "learning_rate": 2.3227871030068835e-05, + "loss": 0.6145, + "step": 4434, + "task_loss": 0.5481329560279846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6404789686203003, + "epoch": 3.75, + "learning_rate": 2.3221833111943002e-05, + "loss": 0.5579, + "step": 4435, + "task_loss": 0.3292856514453888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6401785612106323, + "epoch": 3.75, + "learning_rate": 2.3215795193817173e-05, + "loss": 0.6548, + "step": 4436, + "task_loss": 0.8043035268783569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2880534529685974, + "epoch": 3.75, + "learning_rate": 2.3209757275691343e-05, + "loss": 0.5071, + "step": 4437, + "task_loss": 0.33581164479255676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9013627171516418, + "epoch": 3.75, + "learning_rate": 2.320371935756551e-05, + "loss": 0.6583, + "step": 4438, + "task_loss": 0.6649416089057922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44268637895584106, + "epoch": 3.75, + "learning_rate": 2.3197681439439685e-05, + "loss": 0.5224, + "step": 4439, + "task_loss": 0.267585426568985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4741356372833252, + "epoch": 3.75, + "learning_rate": 2.3191643521313852e-05, + "loss": 0.5593, + "step": 4440, + "task_loss": 1.284013032913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9820587635040283, + "epoch": 3.75, + "learning_rate": 2.3185605603188022e-05, + "loss": 0.615, + "step": 4441, + "task_loss": 0.6287606954574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4262923300266266, + "epoch": 3.75, + "learning_rate": 2.3179567685062193e-05, + "loss": 0.5218, + "step": 4442, + "task_loss": 0.6466645002365112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7400762438774109, + "epoch": 3.76, + "learning_rate": 2.317352976693636e-05, + "loss": 0.607, + "step": 4443, + "task_loss": 0.6763919591903687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6467039585113525, + "epoch": 3.76, + "learning_rate": 2.3167491848810534e-05, + "loss": 0.6712, + "step": 4444, + "task_loss": 1.332582712173462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5418936014175415, + "epoch": 3.76, + "learning_rate": 2.31614539306847e-05, + "loss": 0.4331, + "step": 4445, + "task_loss": 0.9271002411842346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48879605531692505, + "epoch": 3.76, + "learning_rate": 2.315541601255887e-05, + "loss": 0.6039, + "step": 4446, + "task_loss": 0.2636723816394806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7862344980239868, + "epoch": 3.76, + "learning_rate": 2.3149378094433042e-05, + "loss": 0.6406, + "step": 4447, + "task_loss": 0.5329148769378662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6804165840148926, + "epoch": 3.76, + "learning_rate": 2.314334017630721e-05, + "loss": 0.6231, + "step": 4448, + "task_loss": 1.711234211921692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2958729565143585, + "epoch": 3.76, + "learning_rate": 2.313730225818138e-05, + "loss": 0.4861, + "step": 4449, + "task_loss": 0.005031430162489414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6164088249206543, + "epoch": 3.76, + "learning_rate": 2.313126434005555e-05, + "loss": 0.6242, + "step": 4450, + "task_loss": 1.404719591140747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.605385422706604, + "epoch": 3.76, + "learning_rate": 2.3125226421929718e-05, + "loss": 0.6962, + "step": 4451, + "task_loss": 0.793285071849823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.168250560760498, + "epoch": 3.76, + "learning_rate": 2.3119188503803892e-05, + "loss": 0.6267, + "step": 4452, + "task_loss": 0.5260794758796692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5163881778717041, + "epoch": 3.76, + "learning_rate": 2.311315058567806e-05, + "loss": 0.4575, + "step": 4453, + "task_loss": 0.3756598234176636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.610989511013031, + "epoch": 3.76, + "learning_rate": 2.310711266755223e-05, + "loss": 0.4606, + "step": 4454, + "task_loss": 0.7098262906074524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6581740975379944, + "epoch": 3.77, + "learning_rate": 2.31010747494264e-05, + "loss": 0.7261, + "step": 4455, + "task_loss": 0.23161479830741882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.574173629283905, + "epoch": 3.77, + "learning_rate": 2.3095036831300567e-05, + "loss": 0.7126, + "step": 4456, + "task_loss": 0.8425107002258301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9820064306259155, + "epoch": 3.77, + "learning_rate": 2.3088998913174738e-05, + "loss": 0.5566, + "step": 4457, + "task_loss": 0.8681281805038452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42446669936180115, + "epoch": 3.77, + "learning_rate": 2.308296099504891e-05, + "loss": 0.6994, + "step": 4458, + "task_loss": 0.3902852237224579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31970882415771484, + "epoch": 3.77, + "learning_rate": 2.307692307692308e-05, + "loss": 0.49, + "step": 4459, + "task_loss": 0.356161504983902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40275195240974426, + "epoch": 3.77, + "learning_rate": 2.307088515879725e-05, + "loss": 0.5205, + "step": 4460, + "task_loss": 0.6699374914169312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48749005794525146, + "epoch": 3.77, + "learning_rate": 2.3064847240671417e-05, + "loss": 0.5133, + "step": 4461, + "task_loss": 0.36602315306663513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33166787028312683, + "epoch": 3.77, + "learning_rate": 2.3058809322545587e-05, + "loss": 0.4646, + "step": 4462, + "task_loss": 0.46980100870132446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4812576174736023, + "epoch": 3.77, + "learning_rate": 2.3052771404419758e-05, + "loss": 0.4984, + "step": 4463, + "task_loss": 0.46535345911979675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8003339171409607, + "epoch": 3.77, + "learning_rate": 2.304673348629393e-05, + "loss": 0.7401, + "step": 4464, + "task_loss": 1.0721266269683838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4290904402732849, + "epoch": 3.77, + "learning_rate": 2.3040695568168096e-05, + "loss": 0.5695, + "step": 4465, + "task_loss": 0.7608657479286194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43186700344085693, + "epoch": 3.77, + "learning_rate": 2.3034657650042266e-05, + "loss": 0.4622, + "step": 4466, + "task_loss": 0.4474439024925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4712008833885193, + "epoch": 3.78, + "learning_rate": 2.3028619731916437e-05, + "loss": 0.4767, + "step": 4467, + "task_loss": 0.5881128907203674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5413079857826233, + "epoch": 3.78, + "learning_rate": 2.3022581813790607e-05, + "loss": 0.6125, + "step": 4468, + "task_loss": 0.4645686149597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0193965435028076, + "epoch": 3.78, + "learning_rate": 2.3016543895664778e-05, + "loss": 0.7346, + "step": 4469, + "task_loss": 2.3894333839416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3247479200363159, + "epoch": 3.78, + "learning_rate": 2.3010505977538945e-05, + "loss": 0.3696, + "step": 4470, + "task_loss": 0.05943428352475166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44149720668792725, + "epoch": 3.78, + "learning_rate": 2.3004468059413116e-05, + "loss": 0.5811, + "step": 4471, + "task_loss": 0.41300803422927856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6600164771080017, + "epoch": 3.78, + "learning_rate": 2.2998430141287286e-05, + "loss": 0.5751, + "step": 4472, + "task_loss": 0.46976733207702637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33291852474212646, + "epoch": 3.78, + "learning_rate": 2.2992392223161454e-05, + "loss": 0.4932, + "step": 4473, + "task_loss": 0.6892144680023193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.57462477684021, + "epoch": 3.78, + "learning_rate": 2.2986354305035627e-05, + "loss": 0.5333, + "step": 4474, + "task_loss": 0.29683396220207214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7026301622390747, + "epoch": 3.78, + "learning_rate": 2.2980316386909795e-05, + "loss": 0.7137, + "step": 4475, + "task_loss": 0.04255634546279907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20254918932914734, + "epoch": 3.78, + "learning_rate": 2.2974278468783962e-05, + "loss": 0.5424, + "step": 4476, + "task_loss": 0.524287223815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7224549055099487, + "epoch": 3.78, + "learning_rate": 2.2968240550658136e-05, + "loss": 0.5321, + "step": 4477, + "task_loss": 0.7353062629699707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48610368371009827, + "epoch": 3.78, + "learning_rate": 2.2962202632532303e-05, + "loss": 0.5873, + "step": 4478, + "task_loss": 1.062229871749878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46096593141555786, + "epoch": 3.79, + "learning_rate": 2.2956164714406474e-05, + "loss": 0.4934, + "step": 4479, + "task_loss": 0.41122597455978394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4773593544960022, + "epoch": 3.79, + "learning_rate": 2.2950126796280644e-05, + "loss": 0.4989, + "step": 4480, + "task_loss": 0.08708696067333221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6095165014266968, + "epoch": 3.79, + "learning_rate": 2.294408887815481e-05, + "loss": 0.5877, + "step": 4481, + "task_loss": 0.6403176188468933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4502122700214386, + "epoch": 3.79, + "learning_rate": 2.2938050960028985e-05, + "loss": 0.582, + "step": 4482, + "task_loss": 0.9707988500595093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38119742274284363, + "epoch": 3.79, + "learning_rate": 2.2932013041903152e-05, + "loss": 0.4812, + "step": 4483, + "task_loss": 0.16779783368110657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4454629421234131, + "epoch": 3.79, + "learning_rate": 2.2925975123777323e-05, + "loss": 0.5661, + "step": 4484, + "task_loss": 1.1619364023208618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5072379112243652, + "epoch": 3.79, + "learning_rate": 2.2919937205651494e-05, + "loss": 0.5655, + "step": 4485, + "task_loss": 0.7217161655426025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6898370981216431, + "epoch": 3.79, + "learning_rate": 2.291389928752566e-05, + "loss": 0.528, + "step": 4486, + "task_loss": 1.4738593101501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7601476311683655, + "epoch": 3.79, + "learning_rate": 2.290786136939983e-05, + "loss": 0.5276, + "step": 4487, + "task_loss": 1.811610460281372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6716529130935669, + "epoch": 3.79, + "learning_rate": 2.2901823451274002e-05, + "loss": 0.6035, + "step": 4488, + "task_loss": 1.3733972311019897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4354214668273926, + "epoch": 3.79, + "learning_rate": 2.2895785533148172e-05, + "loss": 0.4646, + "step": 4489, + "task_loss": 0.6362360715866089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4062591791152954, + "epoch": 3.79, + "learning_rate": 2.2889747615022343e-05, + "loss": 0.445, + "step": 4490, + "task_loss": 1.172103762626648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4040986895561218, + "epoch": 3.8, + "learning_rate": 2.288370969689651e-05, + "loss": 0.5384, + "step": 4491, + "task_loss": 0.6108687520027161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4843246638774872, + "epoch": 3.8, + "learning_rate": 2.287767177877068e-05, + "loss": 0.6359, + "step": 4492, + "task_loss": 0.8827519416809082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6765078902244568, + "epoch": 3.8, + "learning_rate": 2.287163386064485e-05, + "loss": 0.6901, + "step": 4493, + "task_loss": 1.3149833679199219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7701029777526855, + "epoch": 3.8, + "learning_rate": 2.286559594251902e-05, + "loss": 0.5815, + "step": 4494, + "task_loss": 0.8802153468132019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6770055890083313, + "epoch": 3.8, + "learning_rate": 2.285955802439319e-05, + "loss": 0.468, + "step": 4495, + "task_loss": 0.7696042656898499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8490389585494995, + "epoch": 3.8, + "learning_rate": 2.285352010626736e-05, + "loss": 0.6947, + "step": 4496, + "task_loss": 1.1617990732192993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5762185454368591, + "epoch": 3.8, + "learning_rate": 2.284748218814153e-05, + "loss": 0.5932, + "step": 4497, + "task_loss": 0.6689549088478088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7190088629722595, + "epoch": 3.8, + "learning_rate": 2.28414442700157e-05, + "loss": 0.5068, + "step": 4498, + "task_loss": 0.418878436088562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.258762001991272, + "epoch": 3.8, + "learning_rate": 2.2835406351889868e-05, + "loss": 0.532, + "step": 4499, + "task_loss": 0.11511826515197754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27852538228034973, + "epoch": 3.8, + "learning_rate": 2.282936843376404e-05, + "loss": 0.6075, + "step": 4500, + "task_loss": 1.213998556137085 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.9011485148514852, + "eval_loss": 0.3505952060222626, + "eval_runtime": 227.5401, + "eval_samples_per_second": 110.969, + "eval_steps_per_second": 0.87, + "step": 4500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8224712014198303, + "epoch": 3.8, + "learning_rate": 2.282333051563821e-05, + "loss": 0.6948, + "step": 4501, + "task_loss": 0.3107118308544159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37828683853149414, + "epoch": 3.81, + "learning_rate": 2.281729259751238e-05, + "loss": 0.5334, + "step": 4502, + "task_loss": 0.12263364344835281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45930367708206177, + "epoch": 3.81, + "learning_rate": 2.2811254679386547e-05, + "loss": 0.4597, + "step": 4503, + "task_loss": 0.5213666558265686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3877778649330139, + "epoch": 3.81, + "learning_rate": 2.2805216761260718e-05, + "loss": 0.5537, + "step": 4504, + "task_loss": 0.23105968534946442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5131558179855347, + "epoch": 3.81, + "learning_rate": 2.2799178843134888e-05, + "loss": 0.5952, + "step": 4505, + "task_loss": 0.6777783036231995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7000924348831177, + "epoch": 3.81, + "learning_rate": 2.279314092500906e-05, + "loss": 0.5613, + "step": 4506, + "task_loss": 1.1955558061599731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6547091603279114, + "epoch": 3.81, + "learning_rate": 2.278710300688323e-05, + "loss": 0.4627, + "step": 4507, + "task_loss": 1.5050053596496582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27803266048431396, + "epoch": 3.81, + "learning_rate": 2.2781065088757396e-05, + "loss": 0.4563, + "step": 4508, + "task_loss": 0.48284903168678284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8124231100082397, + "epoch": 3.81, + "learning_rate": 2.2775027170631567e-05, + "loss": 0.589, + "step": 4509, + "task_loss": 0.3941148817539215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.221726417541504, + "epoch": 3.81, + "learning_rate": 2.2768989252505738e-05, + "loss": 0.7193, + "step": 4510, + "task_loss": 1.5887213945388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7639302015304565, + "epoch": 3.81, + "learning_rate": 2.2762951334379905e-05, + "loss": 0.6239, + "step": 4511, + "task_loss": 0.7167043685913086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9234924912452698, + "epoch": 3.81, + "learning_rate": 2.275691341625408e-05, + "loss": 0.6194, + "step": 4512, + "task_loss": 1.0194989442825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4466555416584015, + "epoch": 3.81, + "learning_rate": 2.2750875498128246e-05, + "loss": 0.4811, + "step": 4513, + "task_loss": 0.60932457447052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40963804721832275, + "epoch": 3.82, + "learning_rate": 2.2744837580002416e-05, + "loss": 0.5943, + "step": 4514, + "task_loss": 0.33377841114997864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39531293511390686, + "epoch": 3.82, + "learning_rate": 2.2738799661876587e-05, + "loss": 0.3894, + "step": 4515, + "task_loss": 0.9314152598381042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3533097505569458, + "epoch": 3.82, + "learning_rate": 2.2732761743750754e-05, + "loss": 0.4136, + "step": 4516, + "task_loss": 0.23366022109985352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42140284180641174, + "epoch": 3.82, + "learning_rate": 2.2726723825624928e-05, + "loss": 0.6782, + "step": 4517, + "task_loss": 1.0073362588882446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4779048562049866, + "epoch": 3.82, + "learning_rate": 2.2720685907499095e-05, + "loss": 0.5234, + "step": 4518, + "task_loss": 0.7014445662498474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4349392354488373, + "epoch": 3.82, + "learning_rate": 2.2714647989373263e-05, + "loss": 0.4053, + "step": 4519, + "task_loss": 0.6497784852981567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41986557841300964, + "epoch": 3.82, + "learning_rate": 2.2708610071247436e-05, + "loss": 0.4378, + "step": 4520, + "task_loss": 0.30898967385292053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44661352038383484, + "epoch": 3.82, + "learning_rate": 2.2702572153121604e-05, + "loss": 0.52, + "step": 4521, + "task_loss": 1.5813034772872925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5483782291412354, + "epoch": 3.82, + "learning_rate": 2.2696534234995774e-05, + "loss": 0.5193, + "step": 4522, + "task_loss": 0.13440847396850586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6100765466690063, + "epoch": 3.82, + "learning_rate": 2.2690496316869945e-05, + "loss": 0.6274, + "step": 4523, + "task_loss": 0.2875552177429199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5049988627433777, + "epoch": 3.82, + "learning_rate": 2.2684458398744112e-05, + "loss": 0.5665, + "step": 4524, + "task_loss": 0.8494818806648254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8322737216949463, + "epoch": 3.82, + "learning_rate": 2.2678420480618286e-05, + "loss": 0.6605, + "step": 4525, + "task_loss": 0.4764504134654999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4672878086566925, + "epoch": 3.83, + "learning_rate": 2.2672382562492453e-05, + "loss": 0.5977, + "step": 4526, + "task_loss": 0.8541036248207092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6046249866485596, + "epoch": 3.83, + "learning_rate": 2.2666344644366624e-05, + "loss": 0.5542, + "step": 4527, + "task_loss": 0.781278133392334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4827496409416199, + "epoch": 3.83, + "learning_rate": 2.2660306726240794e-05, + "loss": 0.5586, + "step": 4528, + "task_loss": 0.9666576981544495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7295994758605957, + "epoch": 3.83, + "learning_rate": 2.265426880811496e-05, + "loss": 0.5453, + "step": 4529, + "task_loss": 0.2307213395833969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7430315613746643, + "epoch": 3.83, + "learning_rate": 2.2648230889989132e-05, + "loss": 0.6353, + "step": 4530, + "task_loss": 0.7285252809524536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6091195344924927, + "epoch": 3.83, + "learning_rate": 2.2642192971863303e-05, + "loss": 0.6816, + "step": 4531, + "task_loss": 1.1465976238250732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5833025574684143, + "epoch": 3.83, + "learning_rate": 2.2636155053737473e-05, + "loss": 0.6652, + "step": 4532, + "task_loss": 0.520223081111908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9356515407562256, + "epoch": 3.83, + "learning_rate": 2.263011713561164e-05, + "loss": 0.6789, + "step": 4533, + "task_loss": 0.8323215246200562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5891568660736084, + "epoch": 3.83, + "learning_rate": 2.262407921748581e-05, + "loss": 0.4938, + "step": 4534, + "task_loss": 0.3939654529094696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8318091034889221, + "epoch": 3.83, + "learning_rate": 2.261804129935998e-05, + "loss": 0.723, + "step": 4535, + "task_loss": 2.1684114933013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4679873585700989, + "epoch": 3.83, + "learning_rate": 2.2612003381234152e-05, + "loss": 0.5138, + "step": 4536, + "task_loss": 0.41602635383605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5540313720703125, + "epoch": 3.83, + "learning_rate": 2.2605965463108323e-05, + "loss": 0.5463, + "step": 4537, + "task_loss": 0.6549892425537109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42273813486099243, + "epoch": 3.84, + "learning_rate": 2.259992754498249e-05, + "loss": 0.5119, + "step": 4538, + "task_loss": 0.43286681175231934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4999549984931946, + "epoch": 3.84, + "learning_rate": 2.259388962685666e-05, + "loss": 0.5005, + "step": 4539, + "task_loss": 1.000728726387024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7849093079566956, + "epoch": 3.84, + "learning_rate": 2.258785170873083e-05, + "loss": 0.788, + "step": 4540, + "task_loss": 0.9882803559303284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46597665548324585, + "epoch": 3.84, + "learning_rate": 2.2581813790604998e-05, + "loss": 0.545, + "step": 4541, + "task_loss": 0.5382664799690247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5338019132614136, + "epoch": 3.84, + "learning_rate": 2.2575775872479172e-05, + "loss": 0.5392, + "step": 4542, + "task_loss": 0.30885669589042664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7065733075141907, + "epoch": 3.84, + "learning_rate": 2.256973795435334e-05, + "loss": 0.6024, + "step": 4543, + "task_loss": 1.2602601051330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5769373178482056, + "epoch": 3.84, + "learning_rate": 2.256370003622751e-05, + "loss": 0.6799, + "step": 4544, + "task_loss": 0.5817105770111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5890848636627197, + "epoch": 3.84, + "learning_rate": 2.255766211810168e-05, + "loss": 0.6889, + "step": 4545, + "task_loss": 0.142110213637352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.527262806892395, + "epoch": 3.84, + "learning_rate": 2.2551624199975848e-05, + "loss": 0.5109, + "step": 4546, + "task_loss": 1.1303462982177734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.530597984790802, + "epoch": 3.84, + "learning_rate": 2.254558628185002e-05, + "loss": 0.5752, + "step": 4547, + "task_loss": 0.8478494882583618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5579608082771301, + "epoch": 3.84, + "learning_rate": 2.253954836372419e-05, + "loss": 0.5511, + "step": 4548, + "task_loss": 1.1049139499664307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4899434745311737, + "epoch": 3.84, + "learning_rate": 2.2533510445598356e-05, + "loss": 0.57, + "step": 4549, + "task_loss": 0.872226357460022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6326770782470703, + "epoch": 3.85, + "learning_rate": 2.252747252747253e-05, + "loss": 0.663, + "step": 4550, + "task_loss": 0.5758929252624512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3767385482788086, + "epoch": 3.85, + "learning_rate": 2.2521434609346697e-05, + "loss": 0.554, + "step": 4551, + "task_loss": 0.6987469792366028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.536481499671936, + "epoch": 3.85, + "learning_rate": 2.2515396691220868e-05, + "loss": 0.5169, + "step": 4552, + "task_loss": 0.5736947655677795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4679645597934723, + "epoch": 3.85, + "learning_rate": 2.2509358773095038e-05, + "loss": 0.6493, + "step": 4553, + "task_loss": 1.4364482164382935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5526278018951416, + "epoch": 3.85, + "learning_rate": 2.2503320854969205e-05, + "loss": 0.4566, + "step": 4554, + "task_loss": 0.7572150826454163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6572176218032837, + "epoch": 3.85, + "learning_rate": 2.249728293684338e-05, + "loss": 0.6603, + "step": 4555, + "task_loss": 1.5873498916625977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5072602033615112, + "epoch": 3.85, + "learning_rate": 2.2491245018717547e-05, + "loss": 0.5489, + "step": 4556, + "task_loss": 0.3946949243545532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5887047052383423, + "epoch": 3.85, + "learning_rate": 2.2485207100591717e-05, + "loss": 0.4857, + "step": 4557, + "task_loss": 0.32903701066970825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6570155620574951, + "epoch": 3.85, + "learning_rate": 2.2479169182465888e-05, + "loss": 0.6031, + "step": 4558, + "task_loss": 0.5098884105682373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46782779693603516, + "epoch": 3.85, + "learning_rate": 2.2473131264340055e-05, + "loss": 0.5025, + "step": 4559, + "task_loss": 0.6768575310707092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6555294990539551, + "epoch": 3.85, + "learning_rate": 2.2467093346214225e-05, + "loss": 0.638, + "step": 4560, + "task_loss": 1.189415454864502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3998187780380249, + "epoch": 3.85, + "learning_rate": 2.2461055428088396e-05, + "loss": 0.4107, + "step": 4561, + "task_loss": 0.3535372316837311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7637245655059814, + "epoch": 3.86, + "learning_rate": 2.2455017509962567e-05, + "loss": 0.557, + "step": 4562, + "task_loss": 0.30867066979408264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41419517993927, + "epoch": 3.86, + "learning_rate": 2.2448979591836737e-05, + "loss": 0.6637, + "step": 4563, + "task_loss": 0.5748609304428101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3557615280151367, + "epoch": 3.86, + "learning_rate": 2.2442941673710904e-05, + "loss": 0.4002, + "step": 4564, + "task_loss": 0.19731488823890686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5281579494476318, + "epoch": 3.86, + "learning_rate": 2.2436903755585075e-05, + "loss": 0.4938, + "step": 4565, + "task_loss": 0.12420077621936798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3770504593849182, + "epoch": 3.86, + "learning_rate": 2.2430865837459245e-05, + "loss": 0.5466, + "step": 4566, + "task_loss": 0.49097180366516113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.596808910369873, + "epoch": 3.86, + "learning_rate": 2.2424827919333416e-05, + "loss": 0.527, + "step": 4567, + "task_loss": 0.3285331726074219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6039056777954102, + "epoch": 3.86, + "learning_rate": 2.2418790001207583e-05, + "loss": 0.597, + "step": 4568, + "task_loss": 0.8137860298156738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23368346691131592, + "epoch": 3.86, + "learning_rate": 2.2412752083081754e-05, + "loss": 0.5114, + "step": 4569, + "task_loss": 0.0795210674405098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5441118478775024, + "epoch": 3.86, + "learning_rate": 2.2406714164955924e-05, + "loss": 0.629, + "step": 4570, + "task_loss": 0.772313117980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4644792973995209, + "epoch": 3.86, + "learning_rate": 2.2400676246830095e-05, + "loss": 0.5058, + "step": 4571, + "task_loss": 0.9556580781936646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2741735875606537, + "epoch": 3.86, + "learning_rate": 2.2394638328704266e-05, + "loss": 0.6365, + "step": 4572, + "task_loss": 0.32410821318626404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.804934561252594, + "epoch": 3.87, + "learning_rate": 2.2388600410578433e-05, + "loss": 0.5288, + "step": 4573, + "task_loss": 0.8876804113388062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4231383800506592, + "epoch": 3.87, + "learning_rate": 2.2382562492452603e-05, + "loss": 0.6129, + "step": 4574, + "task_loss": 0.540960967540741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43521153926849365, + "epoch": 3.87, + "learning_rate": 2.2376524574326774e-05, + "loss": 0.5655, + "step": 4575, + "task_loss": 0.44945383071899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23040595650672913, + "epoch": 3.87, + "learning_rate": 2.237048665620094e-05, + "loss": 0.4342, + "step": 4576, + "task_loss": 0.4913996458053589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.4340713024139404, + "epoch": 3.87, + "learning_rate": 2.2364448738075115e-05, + "loss": 0.7879, + "step": 4577, + "task_loss": 1.3690781593322754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5097252726554871, + "epoch": 3.87, + "learning_rate": 2.2358410819949282e-05, + "loss": 0.487, + "step": 4578, + "task_loss": 0.5746940970420837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5900717973709106, + "epoch": 3.87, + "learning_rate": 2.2352372901823453e-05, + "loss": 0.5991, + "step": 4579, + "task_loss": 1.0881357192993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3439572751522064, + "epoch": 3.87, + "learning_rate": 2.2346334983697623e-05, + "loss": 0.5441, + "step": 4580, + "task_loss": 0.6399039030075073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6858565807342529, + "epoch": 3.87, + "learning_rate": 2.234029706557179e-05, + "loss": 0.5342, + "step": 4581, + "task_loss": 0.2370939403772354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45418429374694824, + "epoch": 3.87, + "learning_rate": 2.2334259147445964e-05, + "loss": 0.5826, + "step": 4582, + "task_loss": 0.1713087111711502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6021031737327576, + "epoch": 3.87, + "learning_rate": 2.232822122932013e-05, + "loss": 0.4871, + "step": 4583, + "task_loss": 0.502464234828949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9733784198760986, + "epoch": 3.87, + "learning_rate": 2.23221833111943e-05, + "loss": 0.642, + "step": 4584, + "task_loss": 0.7405000329017639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46803581714630127, + "epoch": 3.88, + "learning_rate": 2.2316145393068473e-05, + "loss": 0.5062, + "step": 4585, + "task_loss": 1.227565884590149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8267889022827148, + "epoch": 3.88, + "learning_rate": 2.231010747494264e-05, + "loss": 0.6598, + "step": 4586, + "task_loss": 0.40738967061042786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38898128271102905, + "epoch": 3.88, + "learning_rate": 2.230406955681681e-05, + "loss": 0.5098, + "step": 4587, + "task_loss": 0.023207519203424454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4987633526325226, + "epoch": 3.88, + "learning_rate": 2.229803163869098e-05, + "loss": 0.4669, + "step": 4588, + "task_loss": 0.48157164454460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.612311840057373, + "epoch": 3.88, + "learning_rate": 2.2291993720565148e-05, + "loss": 0.5674, + "step": 4589, + "task_loss": 1.6587328910827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9272332191467285, + "epoch": 3.88, + "learning_rate": 2.2285955802439322e-05, + "loss": 0.7854, + "step": 4590, + "task_loss": 1.8588898181915283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5013290643692017, + "epoch": 3.88, + "learning_rate": 2.227991788431349e-05, + "loss": 0.6576, + "step": 4591, + "task_loss": 1.0973527431488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.641532301902771, + "epoch": 3.88, + "learning_rate": 2.227387996618766e-05, + "loss": 0.5807, + "step": 4592, + "task_loss": 1.6154718399047852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7165365815162659, + "epoch": 3.88, + "learning_rate": 2.226784204806183e-05, + "loss": 0.5641, + "step": 4593, + "task_loss": 1.4866337776184082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7219924330711365, + "epoch": 3.88, + "learning_rate": 2.2261804129935998e-05, + "loss": 0.6215, + "step": 4594, + "task_loss": 1.0490946769714355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4483376145362854, + "epoch": 3.88, + "learning_rate": 2.225576621181017e-05, + "loss": 0.6278, + "step": 4595, + "task_loss": 0.782343864440918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6821736693382263, + "epoch": 3.88, + "learning_rate": 2.224972829368434e-05, + "loss": 0.7046, + "step": 4596, + "task_loss": 1.5550223588943481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0510276556015015, + "epoch": 3.89, + "learning_rate": 2.224369037555851e-05, + "loss": 0.5766, + "step": 4597, + "task_loss": 0.7875450253486633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34397757053375244, + "epoch": 3.89, + "learning_rate": 2.2237652457432677e-05, + "loss": 0.4682, + "step": 4598, + "task_loss": 0.11856172233819962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7001373767852783, + "epoch": 3.89, + "learning_rate": 2.2231614539306847e-05, + "loss": 0.4381, + "step": 4599, + "task_loss": 0.4853045344352722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37415122985839844, + "epoch": 3.89, + "learning_rate": 2.2225576621181018e-05, + "loss": 0.4368, + "step": 4600, + "task_loss": 1.010152816772461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5745044350624084, + "epoch": 3.89, + "learning_rate": 2.221953870305519e-05, + "loss": 0.5285, + "step": 4601, + "task_loss": 0.4524754285812378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3566026985645294, + "epoch": 3.89, + "learning_rate": 2.221350078492936e-05, + "loss": 0.5408, + "step": 4602, + "task_loss": 0.08953599631786346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4229121208190918, + "epoch": 3.89, + "learning_rate": 2.2207462866803526e-05, + "loss": 0.7673, + "step": 4603, + "task_loss": 0.07930503040552139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7159371376037598, + "epoch": 3.89, + "learning_rate": 2.2201424948677697e-05, + "loss": 0.6315, + "step": 4604, + "task_loss": 1.2674273252487183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.571043848991394, + "epoch": 3.89, + "learning_rate": 2.2195387030551867e-05, + "loss": 0.583, + "step": 4605, + "task_loss": 1.2315365076065063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5418980717658997, + "epoch": 3.89, + "learning_rate": 2.2189349112426034e-05, + "loss": 0.7563, + "step": 4606, + "task_loss": 0.5784525275230408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4133026897907257, + "epoch": 3.89, + "learning_rate": 2.218331119430021e-05, + "loss": 0.4251, + "step": 4607, + "task_loss": 0.3497888147830963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3941519558429718, + "epoch": 3.89, + "learning_rate": 2.2177273276174376e-05, + "loss": 0.5483, + "step": 4608, + "task_loss": 0.36794236302375793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4497247338294983, + "epoch": 3.9, + "learning_rate": 2.2171235358048546e-05, + "loss": 0.5625, + "step": 4609, + "task_loss": 0.5561099052429199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4860917329788208, + "epoch": 3.9, + "learning_rate": 2.2165197439922717e-05, + "loss": 0.6227, + "step": 4610, + "task_loss": 0.8884359002113342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9359780550003052, + "epoch": 3.9, + "learning_rate": 2.2159159521796884e-05, + "loss": 0.6513, + "step": 4611, + "task_loss": 0.9636844396591187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8478249311447144, + "epoch": 3.9, + "learning_rate": 2.2153121603671058e-05, + "loss": 0.689, + "step": 4612, + "task_loss": 0.7148102521896362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41006070375442505, + "epoch": 3.9, + "learning_rate": 2.2147083685545225e-05, + "loss": 0.4983, + "step": 4613, + "task_loss": 0.46146488189697266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2889275848865509, + "epoch": 3.9, + "learning_rate": 2.2141045767419392e-05, + "loss": 0.6655, + "step": 4614, + "task_loss": 0.41868874430656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5516269207000732, + "epoch": 3.9, + "learning_rate": 2.2135007849293566e-05, + "loss": 0.4877, + "step": 4615, + "task_loss": 0.5106433629989624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5226693749427795, + "epoch": 3.9, + "learning_rate": 2.2128969931167733e-05, + "loss": 0.661, + "step": 4616, + "task_loss": 0.321419358253479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45152196288108826, + "epoch": 3.9, + "learning_rate": 2.2122932013041904e-05, + "loss": 0.5746, + "step": 4617, + "task_loss": 1.1926885843276978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4267456531524658, + "epoch": 3.9, + "learning_rate": 2.2116894094916075e-05, + "loss": 0.6823, + "step": 4618, + "task_loss": 0.8775150775909424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4764678478240967, + "epoch": 3.9, + "learning_rate": 2.2110856176790242e-05, + "loss": 0.7081, + "step": 4619, + "task_loss": 0.9472035765647888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5739539861679077, + "epoch": 3.9, + "learning_rate": 2.2104818258664416e-05, + "loss": 0.6143, + "step": 4620, + "task_loss": 1.3189697265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5950912237167358, + "epoch": 3.91, + "learning_rate": 2.2098780340538583e-05, + "loss": 0.7556, + "step": 4621, + "task_loss": 0.34443068504333496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.843063473701477, + "epoch": 3.91, + "learning_rate": 2.2092742422412753e-05, + "loss": 0.6551, + "step": 4622, + "task_loss": 1.0514072179794312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6285462975502014, + "epoch": 3.91, + "learning_rate": 2.2086704504286924e-05, + "loss": 0.7954, + "step": 4623, + "task_loss": 1.5145877599716187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9002059698104858, + "epoch": 3.91, + "learning_rate": 2.208066658616109e-05, + "loss": 0.6222, + "step": 4624, + "task_loss": 0.5354745388031006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6272610425949097, + "epoch": 3.91, + "learning_rate": 2.2074628668035262e-05, + "loss": 0.6092, + "step": 4625, + "task_loss": 1.33392333984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47450876235961914, + "epoch": 3.91, + "learning_rate": 2.2068590749909432e-05, + "loss": 0.5909, + "step": 4626, + "task_loss": 0.6318022012710571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7187767028808594, + "epoch": 3.91, + "learning_rate": 2.2062552831783603e-05, + "loss": 0.632, + "step": 4627, + "task_loss": 0.6666658520698547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35812804102897644, + "epoch": 3.91, + "learning_rate": 2.2056514913657773e-05, + "loss": 0.4955, + "step": 4628, + "task_loss": 1.4737628698349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.695568859577179, + "epoch": 3.91, + "learning_rate": 2.205047699553194e-05, + "loss": 0.6705, + "step": 4629, + "task_loss": 1.3606468439102173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8018062114715576, + "epoch": 3.91, + "learning_rate": 2.204443907740611e-05, + "loss": 0.5904, + "step": 4630, + "task_loss": 0.9940068125724792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39298880100250244, + "epoch": 3.91, + "learning_rate": 2.2038401159280282e-05, + "loss": 0.4934, + "step": 4631, + "task_loss": 0.48967358469963074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6605417728424072, + "epoch": 3.91, + "learning_rate": 2.2032363241154452e-05, + "loss": 0.4987, + "step": 4632, + "task_loss": 1.1239429712295532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5846738815307617, + "epoch": 3.92, + "learning_rate": 2.202632532302862e-05, + "loss": 0.6379, + "step": 4633, + "task_loss": 0.9471979737281799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28248295187950134, + "epoch": 3.92, + "learning_rate": 2.202028740490279e-05, + "loss": 0.5512, + "step": 4634, + "task_loss": 0.10429779440164566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5506902933120728, + "epoch": 3.92, + "learning_rate": 2.201424948677696e-05, + "loss": 0.6641, + "step": 4635, + "task_loss": 0.960847020149231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9819222688674927, + "epoch": 3.92, + "learning_rate": 2.200821156865113e-05, + "loss": 0.7708, + "step": 4636, + "task_loss": 0.5021255612373352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7932045459747314, + "epoch": 3.92, + "learning_rate": 2.2002173650525302e-05, + "loss": 0.6882, + "step": 4637, + "task_loss": 1.3558275699615479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45658987760543823, + "epoch": 3.92, + "learning_rate": 2.199613573239947e-05, + "loss": 0.5658, + "step": 4638, + "task_loss": 0.4359578490257263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2630814015865326, + "epoch": 3.92, + "learning_rate": 2.199009781427364e-05, + "loss": 0.4593, + "step": 4639, + "task_loss": 0.04308855161070824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2529233396053314, + "epoch": 3.92, + "learning_rate": 2.198405989614781e-05, + "loss": 0.3448, + "step": 4640, + "task_loss": 0.6680459380149841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4133487939834595, + "epoch": 3.92, + "learning_rate": 2.1978021978021977e-05, + "loss": 0.5052, + "step": 4641, + "task_loss": 0.19064028561115265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3400127589702606, + "epoch": 3.92, + "learning_rate": 2.197198405989615e-05, + "loss": 0.5259, + "step": 4642, + "task_loss": 0.050473976880311966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30520859360694885, + "epoch": 3.92, + "learning_rate": 2.196594614177032e-05, + "loss": 0.713, + "step": 4643, + "task_loss": 0.6662929058074951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7541505098342896, + "epoch": 3.93, + "learning_rate": 2.195990822364449e-05, + "loss": 0.4854, + "step": 4644, + "task_loss": 0.4065546691417694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8105065226554871, + "epoch": 3.93, + "learning_rate": 2.195387030551866e-05, + "loss": 0.6499, + "step": 4645, + "task_loss": 0.3908928632736206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41057074069976807, + "epoch": 3.93, + "learning_rate": 2.1947832387392827e-05, + "loss": 0.6082, + "step": 4646, + "task_loss": 0.2025093138217926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6684619188308716, + "epoch": 3.93, + "learning_rate": 2.1941794469267e-05, + "loss": 0.6339, + "step": 4647, + "task_loss": 1.867443323135376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5535681247711182, + "epoch": 3.93, + "learning_rate": 2.1935756551141168e-05, + "loss": 0.6595, + "step": 4648, + "task_loss": 0.9673780798912048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5286191701889038, + "epoch": 3.93, + "learning_rate": 2.1929718633015335e-05, + "loss": 0.5135, + "step": 4649, + "task_loss": 0.16273073852062225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6012125015258789, + "epoch": 3.93, + "learning_rate": 2.192368071488951e-05, + "loss": 0.5812, + "step": 4650, + "task_loss": 0.31297779083251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41989293694496155, + "epoch": 3.93, + "learning_rate": 2.1917642796763676e-05, + "loss": 0.5901, + "step": 4651, + "task_loss": 1.8514149188995361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7118070125579834, + "epoch": 3.93, + "learning_rate": 2.1911604878637847e-05, + "loss": 0.5651, + "step": 4652, + "task_loss": 0.3118521273136139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46281954646110535, + "epoch": 3.93, + "learning_rate": 2.1905566960512017e-05, + "loss": 0.5534, + "step": 4653, + "task_loss": 0.9512856006622314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8008638024330139, + "epoch": 3.93, + "learning_rate": 2.1899529042386185e-05, + "loss": 0.7491, + "step": 4654, + "task_loss": 0.4393361210823059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6376082301139832, + "epoch": 3.93, + "learning_rate": 2.189349112426036e-05, + "loss": 0.6085, + "step": 4655, + "task_loss": 0.510701060295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7970024943351746, + "epoch": 3.94, + "learning_rate": 2.1887453206134526e-05, + "loss": 0.6141, + "step": 4656, + "task_loss": 0.9412413239479065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49286746978759766, + "epoch": 3.94, + "learning_rate": 2.1881415288008696e-05, + "loss": 0.5088, + "step": 4657, + "task_loss": 0.2836083471775055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5873324871063232, + "epoch": 3.94, + "learning_rate": 2.1875377369882867e-05, + "loss": 0.4445, + "step": 4658, + "task_loss": 1.1917039155960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8191130757331848, + "epoch": 3.94, + "learning_rate": 2.1869339451757034e-05, + "loss": 0.5338, + "step": 4659, + "task_loss": 0.5051819086074829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2859245240688324, + "epoch": 3.94, + "learning_rate": 2.1863301533631205e-05, + "loss": 0.6807, + "step": 4660, + "task_loss": 1.2019400596618652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36683204770088196, + "epoch": 3.94, + "learning_rate": 2.1857263615505375e-05, + "loss": 0.5412, + "step": 4661, + "task_loss": 1.4809026718139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4598342776298523, + "epoch": 3.94, + "learning_rate": 2.1851225697379546e-05, + "loss": 0.5232, + "step": 4662, + "task_loss": 0.1396491676568985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6264371871948242, + "epoch": 3.94, + "learning_rate": 2.1845187779253713e-05, + "loss": 0.7501, + "step": 4663, + "task_loss": 0.33619552850723267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5229457020759583, + "epoch": 3.94, + "learning_rate": 2.1839149861127884e-05, + "loss": 0.5529, + "step": 4664, + "task_loss": 0.26591020822525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4617083668708801, + "epoch": 3.94, + "learning_rate": 2.1833111943002054e-05, + "loss": 0.4537, + "step": 4665, + "task_loss": 0.9166950583457947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.623773992061615, + "epoch": 3.94, + "learning_rate": 2.1827074024876225e-05, + "loss": 0.5949, + "step": 4666, + "task_loss": 0.3137950897216797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6860369443893433, + "epoch": 3.94, + "learning_rate": 2.1821036106750395e-05, + "loss": 0.736, + "step": 4667, + "task_loss": 1.0741709470748901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4295746386051178, + "epoch": 3.95, + "learning_rate": 2.1814998188624562e-05, + "loss": 0.578, + "step": 4668, + "task_loss": 0.8879985809326172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4842042326927185, + "epoch": 3.95, + "learning_rate": 2.1808960270498733e-05, + "loss": 0.7262, + "step": 4669, + "task_loss": 1.3817460536956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3290368914604187, + "epoch": 3.95, + "learning_rate": 2.1802922352372904e-05, + "loss": 0.4436, + "step": 4670, + "task_loss": 0.3827207684516907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6237121224403381, + "epoch": 3.95, + "learning_rate": 2.179688443424707e-05, + "loss": 0.4726, + "step": 4671, + "task_loss": 0.996856153011322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2989993691444397, + "epoch": 3.95, + "learning_rate": 2.179084651612124e-05, + "loss": 0.5603, + "step": 4672, + "task_loss": 0.8256833553314209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5866856575012207, + "epoch": 3.95, + "learning_rate": 2.1784808597995412e-05, + "loss": 0.6637, + "step": 4673, + "task_loss": 0.6276038289070129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7544771432876587, + "epoch": 3.95, + "learning_rate": 2.1778770679869582e-05, + "loss": 0.6367, + "step": 4674, + "task_loss": 0.6560112833976746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6964298486709595, + "epoch": 3.95, + "learning_rate": 2.1772732761743753e-05, + "loss": 0.6865, + "step": 4675, + "task_loss": 0.3552161157131195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5033692121505737, + "epoch": 3.95, + "learning_rate": 2.176669484361792e-05, + "loss": 0.5288, + "step": 4676, + "task_loss": 0.8332726955413818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.74857097864151, + "epoch": 3.95, + "learning_rate": 2.176065692549209e-05, + "loss": 0.4921, + "step": 4677, + "task_loss": 0.6777297854423523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5213508009910583, + "epoch": 3.95, + "learning_rate": 2.175461900736626e-05, + "loss": 0.5211, + "step": 4678, + "task_loss": 1.1930798292160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6845307946205139, + "epoch": 3.95, + "learning_rate": 2.174858108924043e-05, + "loss": 0.4987, + "step": 4679, + "task_loss": 0.8212258219718933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.718172013759613, + "epoch": 3.96, + "learning_rate": 2.1742543171114602e-05, + "loss": 0.6545, + "step": 4680, + "task_loss": 0.7680811882019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.523149847984314, + "epoch": 3.96, + "learning_rate": 2.173650525298877e-05, + "loss": 0.8008, + "step": 4681, + "task_loss": 0.34052520990371704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5815091729164124, + "epoch": 3.96, + "learning_rate": 2.173046733486294e-05, + "loss": 0.5677, + "step": 4682, + "task_loss": 0.6031589508056641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4138781726360321, + "epoch": 3.96, + "learning_rate": 2.172442941673711e-05, + "loss": 0.5525, + "step": 4683, + "task_loss": 0.6251529455184937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5561714768409729, + "epoch": 3.96, + "learning_rate": 2.1718391498611278e-05, + "loss": 0.6177, + "step": 4684, + "task_loss": 1.3499115705490112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.247907817363739, + "epoch": 3.96, + "learning_rate": 2.1712353580485452e-05, + "loss": 0.4063, + "step": 4685, + "task_loss": 0.5688174366950989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5097026824951172, + "epoch": 3.96, + "learning_rate": 2.170631566235962e-05, + "loss": 0.574, + "step": 4686, + "task_loss": 0.8796315789222717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4184175729751587, + "epoch": 3.96, + "learning_rate": 2.1700277744233786e-05, + "loss": 0.4424, + "step": 4687, + "task_loss": 1.1460038423538208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.591058611869812, + "epoch": 3.96, + "learning_rate": 2.169423982610796e-05, + "loss": 0.592, + "step": 4688, + "task_loss": 0.3004400134086609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5734179019927979, + "epoch": 3.96, + "learning_rate": 2.1688201907982127e-05, + "loss": 0.6525, + "step": 4689, + "task_loss": 0.8673804402351379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5321282148361206, + "epoch": 3.96, + "learning_rate": 2.1682163989856298e-05, + "loss": 0.5689, + "step": 4690, + "task_loss": 0.2225162833929062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.639796257019043, + "epoch": 3.96, + "learning_rate": 2.167612607173047e-05, + "loss": 0.5784, + "step": 4691, + "task_loss": 1.452883005142212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7429713010787964, + "epoch": 3.97, + "learning_rate": 2.1670088153604636e-05, + "loss": 0.6937, + "step": 4692, + "task_loss": 0.8675745129585266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6331377029418945, + "epoch": 3.97, + "learning_rate": 2.166405023547881e-05, + "loss": 0.6601, + "step": 4693, + "task_loss": 0.9951923489570618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35508954524993896, + "epoch": 3.97, + "learning_rate": 2.1658012317352977e-05, + "loss": 0.5008, + "step": 4694, + "task_loss": 0.9431642889976501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.407417893409729, + "epoch": 3.97, + "learning_rate": 2.1651974399227148e-05, + "loss": 0.4959, + "step": 4695, + "task_loss": 0.3801063001155853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6112456917762756, + "epoch": 3.97, + "learning_rate": 2.1645936481101318e-05, + "loss": 0.5073, + "step": 4696, + "task_loss": 0.7563797235488892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.479740172624588, + "epoch": 3.97, + "learning_rate": 2.1639898562975485e-05, + "loss": 0.5015, + "step": 4697, + "task_loss": 0.4670616388320923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5717419385910034, + "epoch": 3.97, + "learning_rate": 2.1633860644849656e-05, + "loss": 0.4939, + "step": 4698, + "task_loss": 1.5216275453567505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5171864032745361, + "epoch": 3.97, + "learning_rate": 2.1627822726723826e-05, + "loss": 0.5194, + "step": 4699, + "task_loss": 0.7689891457557678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30437183380126953, + "epoch": 3.97, + "learning_rate": 2.1621784808597997e-05, + "loss": 0.4351, + "step": 4700, + "task_loss": 0.49128419160842896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.593900740146637, + "epoch": 3.97, + "learning_rate": 2.1615746890472168e-05, + "loss": 0.5984, + "step": 4701, + "task_loss": 0.7727730870246887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49488240480422974, + "epoch": 3.97, + "learning_rate": 2.1609708972346335e-05, + "loss": 0.6023, + "step": 4702, + "task_loss": 0.47161537408828735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7399104833602905, + "epoch": 3.97, + "learning_rate": 2.1603671054220505e-05, + "loss": 0.607, + "step": 4703, + "task_loss": 0.535976231098175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4552808403968811, + "epoch": 3.98, + "learning_rate": 2.1597633136094676e-05, + "loss": 0.717, + "step": 4704, + "task_loss": 0.17608077824115753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6849422454833984, + "epoch": 3.98, + "learning_rate": 2.1591595217968846e-05, + "loss": 0.6083, + "step": 4705, + "task_loss": 0.7298884391784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6557953357696533, + "epoch": 3.98, + "learning_rate": 2.1585557299843014e-05, + "loss": 0.6359, + "step": 4706, + "task_loss": 0.9728128910064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4499448835849762, + "epoch": 3.98, + "learning_rate": 2.1579519381717184e-05, + "loss": 0.4738, + "step": 4707, + "task_loss": 0.42878738045692444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5996905565261841, + "epoch": 3.98, + "learning_rate": 2.1573481463591355e-05, + "loss": 0.5986, + "step": 4708, + "task_loss": 0.5559734106063843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5977228879928589, + "epoch": 3.98, + "learning_rate": 2.1567443545465525e-05, + "loss": 0.626, + "step": 4709, + "task_loss": 0.6031299829483032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3838256001472473, + "epoch": 3.98, + "learning_rate": 2.1561405627339696e-05, + "loss": 0.6501, + "step": 4710, + "task_loss": 0.671137273311615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36928120255470276, + "epoch": 3.98, + "learning_rate": 2.1555367709213863e-05, + "loss": 0.4995, + "step": 4711, + "task_loss": 0.41919150948524475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7278931140899658, + "epoch": 3.98, + "learning_rate": 2.1549329791088034e-05, + "loss": 0.7055, + "step": 4712, + "task_loss": 0.9952042698860168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3223481774330139, + "epoch": 3.98, + "learning_rate": 2.1543291872962204e-05, + "loss": 0.4211, + "step": 4713, + "task_loss": 0.6666816473007202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5668814182281494, + "epoch": 3.98, + "learning_rate": 2.153725395483637e-05, + "loss": 0.8542, + "step": 4714, + "task_loss": 0.8340035676956177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5388627052307129, + "epoch": 3.99, + "learning_rate": 2.1531216036710545e-05, + "loss": 0.5364, + "step": 4715, + "task_loss": 0.45354506373405457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2979797124862671, + "epoch": 3.99, + "learning_rate": 2.1525178118584713e-05, + "loss": 0.6399, + "step": 4716, + "task_loss": 0.16478177905082703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9460223317146301, + "epoch": 3.99, + "learning_rate": 2.1519140200458883e-05, + "loss": 0.8317, + "step": 4717, + "task_loss": 0.5992939472198486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4957447946071625, + "epoch": 3.99, + "learning_rate": 2.1513102282333054e-05, + "loss": 0.5025, + "step": 4718, + "task_loss": 0.419078528881073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3905831575393677, + "epoch": 3.99, + "learning_rate": 2.150706436420722e-05, + "loss": 0.6132, + "step": 4719, + "task_loss": 0.4484780728816986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.519612729549408, + "epoch": 3.99, + "learning_rate": 2.1501026446081395e-05, + "loss": 0.5302, + "step": 4720, + "task_loss": 0.7195404767990112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40026116371154785, + "epoch": 3.99, + "learning_rate": 2.1494988527955562e-05, + "loss": 0.4523, + "step": 4721, + "task_loss": 0.5414677858352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0485765933990479, + "epoch": 3.99, + "learning_rate": 2.148895060982973e-05, + "loss": 0.69, + "step": 4722, + "task_loss": 1.3995647430419922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42104777693748474, + "epoch": 3.99, + "learning_rate": 2.1482912691703903e-05, + "loss": 0.5349, + "step": 4723, + "task_loss": 0.7281877994537354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40247026085853577, + "epoch": 3.99, + "learning_rate": 2.147687477357807e-05, + "loss": 0.5937, + "step": 4724, + "task_loss": 0.48326680064201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6824438571929932, + "epoch": 3.99, + "learning_rate": 2.147083685545224e-05, + "loss": 0.6484, + "step": 4725, + "task_loss": 0.3684065043926239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6422073841094971, + "epoch": 3.99, + "learning_rate": 2.146479893732641e-05, + "loss": 0.5023, + "step": 4726, + "task_loss": 0.33097705245018005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.804389476776123, + "epoch": 4.0, + "learning_rate": 2.145876101920058e-05, + "loss": 0.6558, + "step": 4727, + "task_loss": 1.2167619466781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6194782257080078, + "epoch": 4.0, + "learning_rate": 2.145272310107475e-05, + "loss": 0.5813, + "step": 4728, + "task_loss": 1.4933849573135376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1649617850780487, + "epoch": 4.0, + "learning_rate": 2.144668518294892e-05, + "loss": 0.454, + "step": 4729, + "task_loss": 0.09604117274284363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7983605861663818, + "epoch": 4.0, + "learning_rate": 2.144064726482309e-05, + "loss": 0.8197, + "step": 4730, + "task_loss": 1.3449786901474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7365612983703613, + "epoch": 4.0, + "learning_rate": 2.143460934669726e-05, + "loss": 0.6017, + "step": 4731, + "task_loss": 0.7054152488708496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5873042941093445, + "epoch": 4.0, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.5771, + "step": 4732, + "task_loss": 0.5025756359100342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31387650966644287, + "epoch": 4.0, + "learning_rate": 2.14225335104456e-05, + "loss": 0.6364, + "step": 4733, + "task_loss": 0.7157849073410034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5177661180496216, + "epoch": 4.0, + "learning_rate": 2.141649559231977e-05, + "loss": 0.5709, + "step": 4734, + "task_loss": 1.0256260633468628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3835583031177521, + "epoch": 4.0, + "learning_rate": 2.141045767419394e-05, + "loss": 0.5284, + "step": 4735, + "task_loss": 1.4411654472351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5911240577697754, + "epoch": 4.0, + "learning_rate": 2.1404419756068107e-05, + "loss": 0.5764, + "step": 4736, + "task_loss": 1.5805437564849854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49940869212150574, + "epoch": 4.0, + "learning_rate": 2.1398381837942278e-05, + "loss": 0.5287, + "step": 4737, + "task_loss": 0.33925512433052063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5766795873641968, + "epoch": 4.01, + "learning_rate": 2.1392343919816448e-05, + "loss": 0.5163, + "step": 4738, + "task_loss": 0.5510725975036621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6070358157157898, + "epoch": 4.01, + "learning_rate": 2.138630600169062e-05, + "loss": 0.6029, + "step": 4739, + "task_loss": 0.365557461977005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4499216079711914, + "epoch": 4.01, + "learning_rate": 2.138026808356479e-05, + "loss": 0.4547, + "step": 4740, + "task_loss": 0.28514420986175537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3616742193698883, + "epoch": 4.01, + "learning_rate": 2.1374230165438957e-05, + "loss": 0.5692, + "step": 4741, + "task_loss": 0.3485644459724426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5094785690307617, + "epoch": 4.01, + "learning_rate": 2.1368192247313127e-05, + "loss": 0.5021, + "step": 4742, + "task_loss": 0.1370406448841095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40034934878349304, + "epoch": 4.01, + "learning_rate": 2.1362154329187298e-05, + "loss": 0.525, + "step": 4743, + "task_loss": 0.055064428597688675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38077637553215027, + "epoch": 4.01, + "learning_rate": 2.1356116411061465e-05, + "loss": 0.7244, + "step": 4744, + "task_loss": 0.43246057629585266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38661447167396545, + "epoch": 4.01, + "learning_rate": 2.135007849293564e-05, + "loss": 0.4942, + "step": 4745, + "task_loss": 0.49314892292022705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.415460467338562, + "epoch": 4.01, + "learning_rate": 2.1344040574809806e-05, + "loss": 0.4116, + "step": 4746, + "task_loss": 0.2467094361782074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.771670401096344, + "epoch": 4.01, + "learning_rate": 2.1338002656683977e-05, + "loss": 0.7333, + "step": 4747, + "task_loss": 0.1999107003211975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5362812280654907, + "epoch": 4.01, + "learning_rate": 2.1331964738558147e-05, + "loss": 0.7488, + "step": 4748, + "task_loss": 0.747908353805542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1375824213027954, + "epoch": 4.01, + "learning_rate": 2.1325926820432314e-05, + "loss": 0.6935, + "step": 4749, + "task_loss": 0.7045477032661438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5673202276229858, + "epoch": 4.02, + "learning_rate": 2.1319888902306488e-05, + "loss": 0.5424, + "step": 4750, + "task_loss": 0.77134770154953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6390001177787781, + "epoch": 4.02, + "learning_rate": 2.1313850984180655e-05, + "loss": 0.5799, + "step": 4751, + "task_loss": 0.3354516625404358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4571671485900879, + "epoch": 4.02, + "learning_rate": 2.1307813066054823e-05, + "loss": 0.6341, + "step": 4752, + "task_loss": 0.7684768438339233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4873442053794861, + "epoch": 4.02, + "learning_rate": 2.1301775147928997e-05, + "loss": 0.6232, + "step": 4753, + "task_loss": 0.7417385578155518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5556789636611938, + "epoch": 4.02, + "learning_rate": 2.1295737229803164e-05, + "loss": 0.8175, + "step": 4754, + "task_loss": 0.06997604668140411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6510725021362305, + "epoch": 4.02, + "learning_rate": 2.1289699311677334e-05, + "loss": 0.6165, + "step": 4755, + "task_loss": 0.664560854434967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6853417158126831, + "epoch": 4.02, + "learning_rate": 2.1283661393551505e-05, + "loss": 0.7159, + "step": 4756, + "task_loss": 0.7184975147247314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7224293351173401, + "epoch": 4.02, + "learning_rate": 2.1277623475425672e-05, + "loss": 0.5319, + "step": 4757, + "task_loss": 1.2513842582702637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.692254900932312, + "epoch": 4.02, + "learning_rate": 2.1271585557299846e-05, + "loss": 0.634, + "step": 4758, + "task_loss": 0.5582863688468933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47624650597572327, + "epoch": 4.02, + "learning_rate": 2.1265547639174013e-05, + "loss": 0.4892, + "step": 4759, + "task_loss": 0.6942484378814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36005699634552, + "epoch": 4.02, + "learning_rate": 2.1259509721048184e-05, + "loss": 0.4479, + "step": 4760, + "task_loss": 0.3934752643108368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0560712814331055, + "epoch": 4.02, + "learning_rate": 2.1253471802922354e-05, + "loss": 0.5956, + "step": 4761, + "task_loss": 1.5140043497085571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4561522603034973, + "epoch": 4.03, + "learning_rate": 2.124743388479652e-05, + "loss": 0.5798, + "step": 4762, + "task_loss": 0.7723429203033447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5672728419303894, + "epoch": 4.03, + "learning_rate": 2.1241395966670692e-05, + "loss": 0.5252, + "step": 4763, + "task_loss": 0.5084905028343201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5851724147796631, + "epoch": 4.03, + "learning_rate": 2.1235358048544863e-05, + "loss": 0.5389, + "step": 4764, + "task_loss": 0.5211634039878845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7486690282821655, + "epoch": 4.03, + "learning_rate": 2.1229320130419033e-05, + "loss": 0.5966, + "step": 4765, + "task_loss": 0.6716009378433228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2804730534553528, + "epoch": 4.03, + "learning_rate": 2.1223282212293204e-05, + "loss": 0.4164, + "step": 4766, + "task_loss": 0.819311261177063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420080423355103, + "epoch": 4.03, + "learning_rate": 2.121724429416737e-05, + "loss": 0.5266, + "step": 4767, + "task_loss": 0.06789326667785645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3944246768951416, + "epoch": 4.03, + "learning_rate": 2.121120637604154e-05, + "loss": 0.353, + "step": 4768, + "task_loss": 0.5486432909965515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35936596989631653, + "epoch": 4.03, + "learning_rate": 2.1205168457915712e-05, + "loss": 0.3474, + "step": 4769, + "task_loss": 0.2861657440662384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6819813251495361, + "epoch": 4.03, + "learning_rate": 2.1199130539789883e-05, + "loss": 0.725, + "step": 4770, + "task_loss": 0.4608876407146454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23729769885540009, + "epoch": 4.03, + "learning_rate": 2.119309262166405e-05, + "loss": 0.4699, + "step": 4771, + "task_loss": 0.1860072761774063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2965691089630127, + "epoch": 4.03, + "learning_rate": 2.118705470353822e-05, + "loss": 0.4626, + "step": 4772, + "task_loss": 0.4933810234069824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1424657106399536, + "epoch": 4.03, + "learning_rate": 2.118101678541239e-05, + "loss": 0.6838, + "step": 4773, + "task_loss": 1.276176929473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4051796793937683, + "epoch": 4.04, + "learning_rate": 2.117497886728656e-05, + "loss": 0.712, + "step": 4774, + "task_loss": 0.2759948670864105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5624643564224243, + "epoch": 4.04, + "learning_rate": 2.1168940949160732e-05, + "loss": 0.5137, + "step": 4775, + "task_loss": 0.8818033933639526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37638020515441895, + "epoch": 4.04, + "learning_rate": 2.11629030310349e-05, + "loss": 0.5196, + "step": 4776, + "task_loss": 0.5820809006690979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3399655818939209, + "epoch": 4.04, + "learning_rate": 2.115686511290907e-05, + "loss": 0.5017, + "step": 4777, + "task_loss": 1.0989539623260498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4700782597064972, + "epoch": 4.04, + "learning_rate": 2.115082719478324e-05, + "loss": 0.8396, + "step": 4778, + "task_loss": 0.819175660610199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39835840463638306, + "epoch": 4.04, + "learning_rate": 2.1144789276657408e-05, + "loss": 0.4798, + "step": 4779, + "task_loss": 1.0504848957061768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2795675992965698, + "epoch": 4.04, + "learning_rate": 2.113875135853158e-05, + "loss": 0.6328, + "step": 4780, + "task_loss": 0.7517013549804688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5812097787857056, + "epoch": 4.04, + "learning_rate": 2.113271344040575e-05, + "loss": 0.552, + "step": 4781, + "task_loss": 0.739239513874054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5383638143539429, + "epoch": 4.04, + "learning_rate": 2.112667552227992e-05, + "loss": 0.7043, + "step": 4782, + "task_loss": 0.40152570605278015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5513266921043396, + "epoch": 4.04, + "learning_rate": 2.112063760415409e-05, + "loss": 0.4508, + "step": 4783, + "task_loss": 0.6069225668907166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2666395306587219, + "epoch": 4.04, + "learning_rate": 2.1114599686028257e-05, + "loss": 0.4327, + "step": 4784, + "task_loss": 0.3989448845386505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4584823548793793, + "epoch": 4.04, + "learning_rate": 2.110856176790243e-05, + "loss": 0.4622, + "step": 4785, + "task_loss": 0.3689836859703064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6570338606834412, + "epoch": 4.05, + "learning_rate": 2.11025238497766e-05, + "loss": 0.5397, + "step": 4786, + "task_loss": 0.9169428944587708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4032033383846283, + "epoch": 4.05, + "learning_rate": 2.1096485931650766e-05, + "loss": 0.6497, + "step": 4787, + "task_loss": 0.8290825486183167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6730924844741821, + "epoch": 4.05, + "learning_rate": 2.109044801352494e-05, + "loss": 0.5493, + "step": 4788, + "task_loss": 0.7846968173980713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4957803785800934, + "epoch": 4.05, + "learning_rate": 2.1084410095399107e-05, + "loss": 0.5058, + "step": 4789, + "task_loss": 0.3049405515193939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6287276148796082, + "epoch": 4.05, + "learning_rate": 2.1078372177273277e-05, + "loss": 0.7166, + "step": 4790, + "task_loss": 0.5100951790809631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2914942800998688, + "epoch": 4.05, + "learning_rate": 2.1072334259147448e-05, + "loss": 0.4147, + "step": 4791, + "task_loss": 0.25010931491851807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32795974612236023, + "epoch": 4.05, + "learning_rate": 2.1066296341021615e-05, + "loss": 0.476, + "step": 4792, + "task_loss": 0.6992529630661011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3864227533340454, + "epoch": 4.05, + "learning_rate": 2.1060258422895786e-05, + "loss": 0.4805, + "step": 4793, + "task_loss": 0.4145110547542572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3530798554420471, + "epoch": 4.05, + "learning_rate": 2.1054220504769956e-05, + "loss": 0.3907, + "step": 4794, + "task_loss": 0.06814588606357574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28139442205429077, + "epoch": 4.05, + "learning_rate": 2.1048182586644127e-05, + "loss": 0.49, + "step": 4795, + "task_loss": 0.0899096354842186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6951201558113098, + "epoch": 4.05, + "learning_rate": 2.1042144668518297e-05, + "loss": 0.5916, + "step": 4796, + "task_loss": 1.1115447282791138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5278486013412476, + "epoch": 4.05, + "learning_rate": 2.1036106750392464e-05, + "loss": 0.5752, + "step": 4797, + "task_loss": 1.0248665809631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40249085426330566, + "epoch": 4.06, + "learning_rate": 2.1030068832266635e-05, + "loss": 0.5342, + "step": 4798, + "task_loss": 0.4114725589752197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6014667749404907, + "epoch": 4.06, + "learning_rate": 2.1024030914140806e-05, + "loss": 0.4923, + "step": 4799, + "task_loss": 0.8599461317062378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5211051106452942, + "epoch": 4.06, + "learning_rate": 2.1017992996014976e-05, + "loss": 0.5223, + "step": 4800, + "task_loss": 0.17390498518943787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5792571306228638, + "epoch": 4.06, + "learning_rate": 2.1011955077889143e-05, + "loss": 0.5304, + "step": 4801, + "task_loss": 0.9854143857955933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32039448618888855, + "epoch": 4.06, + "learning_rate": 2.1005917159763314e-05, + "loss": 0.6038, + "step": 4802, + "task_loss": 0.13671576976776123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37806111574172974, + "epoch": 4.06, + "learning_rate": 2.0999879241637484e-05, + "loss": 0.5315, + "step": 4803, + "task_loss": 0.45222413539886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39244237542152405, + "epoch": 4.06, + "learning_rate": 2.0993841323511655e-05, + "loss": 0.5848, + "step": 4804, + "task_loss": 0.9736164808273315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19975732266902924, + "epoch": 4.06, + "learning_rate": 2.0987803405385826e-05, + "loss": 0.4394, + "step": 4805, + "task_loss": 0.40974661707878113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31402015686035156, + "epoch": 4.06, + "learning_rate": 2.0981765487259993e-05, + "loss": 0.5291, + "step": 4806, + "task_loss": 0.5116965174674988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6494240164756775, + "epoch": 4.06, + "learning_rate": 2.0975727569134163e-05, + "loss": 0.657, + "step": 4807, + "task_loss": 0.10465505719184875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5152121782302856, + "epoch": 4.06, + "learning_rate": 2.0969689651008334e-05, + "loss": 0.6183, + "step": 4808, + "task_loss": 0.791029155254364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.778637707233429, + "epoch": 4.07, + "learning_rate": 2.09636517328825e-05, + "loss": 0.5603, + "step": 4809, + "task_loss": 1.1276065111160278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4197838306427002, + "epoch": 4.07, + "learning_rate": 2.0957613814756675e-05, + "loss": 0.5307, + "step": 4810, + "task_loss": 0.2402072250843048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2966751158237457, + "epoch": 4.07, + "learning_rate": 2.0951575896630842e-05, + "loss": 0.4933, + "step": 4811, + "task_loss": 0.676371157169342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35055047273635864, + "epoch": 4.07, + "learning_rate": 2.0945537978505013e-05, + "loss": 0.4212, + "step": 4812, + "task_loss": 1.3238452672958374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5209638476371765, + "epoch": 4.07, + "learning_rate": 2.0939500060379183e-05, + "loss": 0.5894, + "step": 4813, + "task_loss": 0.8793561458587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4854776859283447, + "epoch": 4.07, + "learning_rate": 2.093346214225335e-05, + "loss": 0.5731, + "step": 4814, + "task_loss": 1.1083720922470093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44690054655075073, + "epoch": 4.07, + "learning_rate": 2.0927424224127525e-05, + "loss": 0.5348, + "step": 4815, + "task_loss": 0.1143370270729065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3003236651420593, + "epoch": 4.07, + "learning_rate": 2.0921386306001692e-05, + "loss": 0.3989, + "step": 4816, + "task_loss": 0.2688910961151123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5574758052825928, + "epoch": 4.07, + "learning_rate": 2.091534838787586e-05, + "loss": 0.6373, + "step": 4817, + "task_loss": 0.9178678393363953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4432816207408905, + "epoch": 4.07, + "learning_rate": 2.0909310469750033e-05, + "loss": 0.4265, + "step": 4818, + "task_loss": 0.5627980828285217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5458700656890869, + "epoch": 4.07, + "learning_rate": 2.09032725516242e-05, + "loss": 0.4196, + "step": 4819, + "task_loss": 0.45111384987831116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4634338617324829, + "epoch": 4.07, + "learning_rate": 2.089723463349837e-05, + "loss": 0.4065, + "step": 4820, + "task_loss": 0.3636632263660431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45153355598449707, + "epoch": 4.08, + "learning_rate": 2.089119671537254e-05, + "loss": 0.4754, + "step": 4821, + "task_loss": 0.6986990571022034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3073423206806183, + "epoch": 4.08, + "learning_rate": 2.088515879724671e-05, + "loss": 0.6625, + "step": 4822, + "task_loss": 0.522067129611969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48834651708602905, + "epoch": 4.08, + "learning_rate": 2.0879120879120882e-05, + "loss": 0.5237, + "step": 4823, + "task_loss": 0.5856226086616516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3686964511871338, + "epoch": 4.08, + "learning_rate": 2.087308296099505e-05, + "loss": 0.5105, + "step": 4824, + "task_loss": 0.3028053045272827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46363118290901184, + "epoch": 4.08, + "learning_rate": 2.086704504286922e-05, + "loss": 0.5166, + "step": 4825, + "task_loss": 0.8139827251434326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6916056275367737, + "epoch": 4.08, + "learning_rate": 2.086100712474339e-05, + "loss": 0.4746, + "step": 4826, + "task_loss": 0.5807672142982483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3776167631149292, + "epoch": 4.08, + "learning_rate": 2.0854969206617558e-05, + "loss": 0.4624, + "step": 4827, + "task_loss": 0.39206063747406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4769865870475769, + "epoch": 4.08, + "learning_rate": 2.084893128849173e-05, + "loss": 0.6137, + "step": 4828, + "task_loss": 0.5209089517593384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36051177978515625, + "epoch": 4.08, + "learning_rate": 2.08428933703659e-05, + "loss": 0.451, + "step": 4829, + "task_loss": 0.4639691710472107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5690087676048279, + "epoch": 4.08, + "learning_rate": 2.083685545224007e-05, + "loss": 0.5595, + "step": 4830, + "task_loss": 0.8772652745246887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33982405066490173, + "epoch": 4.08, + "learning_rate": 2.083081753411424e-05, + "loss": 0.577, + "step": 4831, + "task_loss": 0.7649607062339783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5056725144386292, + "epoch": 4.08, + "learning_rate": 2.0824779615988407e-05, + "loss": 0.5053, + "step": 4832, + "task_loss": 0.9000107645988464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4725050926208496, + "epoch": 4.09, + "learning_rate": 2.0818741697862578e-05, + "loss": 0.662, + "step": 4833, + "task_loss": 1.650697946548462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43998655676841736, + "epoch": 4.09, + "learning_rate": 2.081270377973675e-05, + "loss": 0.5597, + "step": 4834, + "task_loss": 0.6790078282356262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34183740615844727, + "epoch": 4.09, + "learning_rate": 2.080666586161092e-05, + "loss": 0.3536, + "step": 4835, + "task_loss": 0.6885197162628174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8613196611404419, + "epoch": 4.09, + "learning_rate": 2.0800627943485086e-05, + "loss": 0.6661, + "step": 4836, + "task_loss": 1.4692519903182983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42925214767456055, + "epoch": 4.09, + "learning_rate": 2.0794590025359257e-05, + "loss": 0.561, + "step": 4837, + "task_loss": 0.8453882932662964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5459936857223511, + "epoch": 4.09, + "learning_rate": 2.0788552107233427e-05, + "loss": 0.7471, + "step": 4838, + "task_loss": 1.9009873867034912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7726955413818359, + "epoch": 4.09, + "learning_rate": 2.0782514189107598e-05, + "loss": 0.6079, + "step": 4839, + "task_loss": 0.6810758709907532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6751399040222168, + "epoch": 4.09, + "learning_rate": 2.077647627098177e-05, + "loss": 0.7232, + "step": 4840, + "task_loss": 0.5369904637336731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43171146512031555, + "epoch": 4.09, + "learning_rate": 2.0770438352855936e-05, + "loss": 0.5431, + "step": 4841, + "task_loss": 0.851557195186615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6060367822647095, + "epoch": 4.09, + "learning_rate": 2.0764400434730106e-05, + "loss": 0.6636, + "step": 4842, + "task_loss": 0.4953901469707489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31919586658477783, + "epoch": 4.09, + "learning_rate": 2.0758362516604277e-05, + "loss": 0.4883, + "step": 4843, + "task_loss": 0.8588495850563049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6748343110084534, + "epoch": 4.09, + "learning_rate": 2.0752324598478444e-05, + "loss": 0.6488, + "step": 4844, + "task_loss": 0.9404436945915222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5297982096672058, + "epoch": 4.1, + "learning_rate": 2.0746286680352618e-05, + "loss": 0.476, + "step": 4845, + "task_loss": 0.9502042531967163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4688900411128998, + "epoch": 4.1, + "learning_rate": 2.0740248762226785e-05, + "loss": 0.6395, + "step": 4846, + "task_loss": 0.7039148807525635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6190444231033325, + "epoch": 4.1, + "learning_rate": 2.0734210844100956e-05, + "loss": 0.4493, + "step": 4847, + "task_loss": 0.43178680539131165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.537518322467804, + "epoch": 4.1, + "learning_rate": 2.0728172925975126e-05, + "loss": 0.4935, + "step": 4848, + "task_loss": 0.8373633623123169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6106147170066833, + "epoch": 4.1, + "learning_rate": 2.0722135007849293e-05, + "loss": 0.5937, + "step": 4849, + "task_loss": 0.6050535440444946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5739820599555969, + "epoch": 4.1, + "learning_rate": 2.0716097089723464e-05, + "loss": 0.5613, + "step": 4850, + "task_loss": 0.784114420413971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2724568545818329, + "epoch": 4.1, + "learning_rate": 2.0710059171597635e-05, + "loss": 0.3785, + "step": 4851, + "task_loss": 0.28200775384902954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3734396994113922, + "epoch": 4.1, + "learning_rate": 2.0704021253471802e-05, + "loss": 0.5408, + "step": 4852, + "task_loss": 0.48892220854759216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3134028911590576, + "epoch": 4.1, + "learning_rate": 2.0697983335345976e-05, + "loss": 0.4789, + "step": 4853, + "task_loss": 0.2794402837753296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6287461519241333, + "epoch": 4.1, + "learning_rate": 2.0691945417220143e-05, + "loss": 0.5876, + "step": 4854, + "task_loss": 0.2488531470298767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44084808230400085, + "epoch": 4.1, + "learning_rate": 2.0685907499094314e-05, + "loss": 0.4973, + "step": 4855, + "task_loss": 0.30437490344047546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4538155198097229, + "epoch": 4.1, + "learning_rate": 2.0679869580968484e-05, + "loss": 0.4641, + "step": 4856, + "task_loss": 0.35117271542549133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5552226901054382, + "epoch": 4.11, + "learning_rate": 2.067383166284265e-05, + "loss": 0.5211, + "step": 4857, + "task_loss": 0.6853883266448975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9495399594306946, + "epoch": 4.11, + "learning_rate": 2.0667793744716822e-05, + "loss": 0.6362, + "step": 4858, + "task_loss": 1.8198763132095337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4911735951900482, + "epoch": 4.11, + "learning_rate": 2.0661755826590992e-05, + "loss": 0.5015, + "step": 4859, + "task_loss": 0.7053030729293823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7839615941047668, + "epoch": 4.11, + "learning_rate": 2.065571790846516e-05, + "loss": 0.7798, + "step": 4860, + "task_loss": 0.3576664626598358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7067352533340454, + "epoch": 4.11, + "learning_rate": 2.0649679990339334e-05, + "loss": 0.61, + "step": 4861, + "task_loss": 1.1211578845977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47274214029312134, + "epoch": 4.11, + "learning_rate": 2.06436420722135e-05, + "loss": 0.5948, + "step": 4862, + "task_loss": 1.039860486984253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5427668690681458, + "epoch": 4.11, + "learning_rate": 2.063760415408767e-05, + "loss": 0.5503, + "step": 4863, + "task_loss": 1.6784418821334839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36723172664642334, + "epoch": 4.11, + "learning_rate": 2.0631566235961842e-05, + "loss": 0.4019, + "step": 4864, + "task_loss": 0.6649518013000488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2425124645233154, + "epoch": 4.11, + "learning_rate": 2.062552831783601e-05, + "loss": 0.7267, + "step": 4865, + "task_loss": 0.867608368396759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36438655853271484, + "epoch": 4.11, + "learning_rate": 2.061949039971018e-05, + "loss": 0.5222, + "step": 4866, + "task_loss": 0.6334699392318726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.812321126461029, + "epoch": 4.11, + "learning_rate": 2.061345248158435e-05, + "loss": 0.5441, + "step": 4867, + "task_loss": 1.1545593738555908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.555686891078949, + "epoch": 4.11, + "learning_rate": 2.060741456345852e-05, + "loss": 0.6475, + "step": 4868, + "task_loss": 1.6450722217559814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24309560656547546, + "epoch": 4.12, + "learning_rate": 2.060137664533269e-05, + "loss": 0.4101, + "step": 4869, + "task_loss": 0.08592002093791962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7391124367713928, + "epoch": 4.12, + "learning_rate": 2.059533872720686e-05, + "loss": 0.7967, + "step": 4870, + "task_loss": 0.9574471116065979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6637915372848511, + "epoch": 4.12, + "learning_rate": 2.058930080908103e-05, + "loss": 0.6258, + "step": 4871, + "task_loss": 0.572775661945343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.414417028427124, + "epoch": 4.12, + "learning_rate": 2.05832628909552e-05, + "loss": 0.5254, + "step": 4872, + "task_loss": 0.8022780418395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420283079147339, + "epoch": 4.12, + "learning_rate": 2.057722497282937e-05, + "loss": 0.4529, + "step": 4873, + "task_loss": 0.3423026204109192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5352892279624939, + "epoch": 4.12, + "learning_rate": 2.0571187054703537e-05, + "loss": 0.4552, + "step": 4874, + "task_loss": 0.8503915667533875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40901365876197815, + "epoch": 4.12, + "learning_rate": 2.0565149136577708e-05, + "loss": 0.4971, + "step": 4875, + "task_loss": 0.33442750573158264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.81807941198349, + "epoch": 4.12, + "learning_rate": 2.055911121845188e-05, + "loss": 0.5126, + "step": 4876, + "task_loss": 0.9090336561203003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5702627897262573, + "epoch": 4.12, + "learning_rate": 2.055307330032605e-05, + "loss": 0.4937, + "step": 4877, + "task_loss": 0.8171088695526123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5972639322280884, + "epoch": 4.12, + "learning_rate": 2.054703538220022e-05, + "loss": 0.5401, + "step": 4878, + "task_loss": 0.491543173789978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21900400519371033, + "epoch": 4.12, + "learning_rate": 2.0540997464074387e-05, + "loss": 0.5876, + "step": 4879, + "task_loss": 0.3664439022541046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5002795457839966, + "epoch": 4.13, + "learning_rate": 2.0534959545948557e-05, + "loss": 0.5045, + "step": 4880, + "task_loss": 0.2700909972190857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3265456557273865, + "epoch": 4.13, + "learning_rate": 2.0528921627822728e-05, + "loss": 0.437, + "step": 4881, + "task_loss": 0.13341550529003143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5095371603965759, + "epoch": 4.13, + "learning_rate": 2.0522883709696895e-05, + "loss": 0.613, + "step": 4882, + "task_loss": 1.263572335243225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4166337847709656, + "epoch": 4.13, + "learning_rate": 2.051684579157107e-05, + "loss": 0.4706, + "step": 4883, + "task_loss": 1.2293126583099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7518548965454102, + "epoch": 4.13, + "learning_rate": 2.0510807873445236e-05, + "loss": 0.6321, + "step": 4884, + "task_loss": 0.7778270244598389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5106979608535767, + "epoch": 4.13, + "learning_rate": 2.0504769955319407e-05, + "loss": 0.5266, + "step": 4885, + "task_loss": 0.1402025818824768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48517274856567383, + "epoch": 4.13, + "learning_rate": 2.0498732037193578e-05, + "loss": 0.5374, + "step": 4886, + "task_loss": 0.31376367807388306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39052340388298035, + "epoch": 4.13, + "learning_rate": 2.0492694119067745e-05, + "loss": 0.4256, + "step": 4887, + "task_loss": 0.21021248400211334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5862571597099304, + "epoch": 4.13, + "learning_rate": 2.048665620094192e-05, + "loss": 0.6384, + "step": 4888, + "task_loss": 0.933180034160614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3617023825645447, + "epoch": 4.13, + "learning_rate": 2.0480618282816086e-05, + "loss": 0.4606, + "step": 4889, + "task_loss": 0.08401191234588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5172408223152161, + "epoch": 4.13, + "learning_rate": 2.0474580364690253e-05, + "loss": 0.6933, + "step": 4890, + "task_loss": 1.0569700002670288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38278430700302124, + "epoch": 4.13, + "learning_rate": 2.0468542446564427e-05, + "loss": 0.5293, + "step": 4891, + "task_loss": 0.6304617524147034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5206208825111389, + "epoch": 4.14, + "learning_rate": 2.0462504528438594e-05, + "loss": 0.6246, + "step": 4892, + "task_loss": 0.6471975445747375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7046142816543579, + "epoch": 4.14, + "learning_rate": 2.0456466610312765e-05, + "loss": 0.6234, + "step": 4893, + "task_loss": 1.216585397720337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6257479190826416, + "epoch": 4.14, + "learning_rate": 2.0450428692186935e-05, + "loss": 0.5103, + "step": 4894, + "task_loss": 0.7435041666030884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4277074337005615, + "epoch": 4.14, + "learning_rate": 2.0444390774061102e-05, + "loss": 0.4981, + "step": 4895, + "task_loss": 0.955923855304718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7934571504592896, + "epoch": 4.14, + "learning_rate": 2.0438352855935276e-05, + "loss": 0.4517, + "step": 4896, + "task_loss": 0.8734060525894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4151521325111389, + "epoch": 4.14, + "learning_rate": 2.0432314937809444e-05, + "loss": 0.7062, + "step": 4897, + "task_loss": 1.0998998880386353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5007086992263794, + "epoch": 4.14, + "learning_rate": 2.0426277019683614e-05, + "loss": 0.7064, + "step": 4898, + "task_loss": 0.955003559589386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3017944097518921, + "epoch": 4.14, + "learning_rate": 2.0420239101557785e-05, + "loss": 0.4384, + "step": 4899, + "task_loss": 0.18417111039161682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41983672976493835, + "epoch": 4.14, + "learning_rate": 2.0414201183431952e-05, + "loss": 0.5696, + "step": 4900, + "task_loss": 0.5690034031867981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7621749043464661, + "epoch": 4.14, + "learning_rate": 2.0408163265306123e-05, + "loss": 0.5547, + "step": 4901, + "task_loss": 1.0805693864822388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27504539489746094, + "epoch": 4.14, + "learning_rate": 2.0402125347180293e-05, + "loss": 0.4971, + "step": 4902, + "task_loss": 0.8569192290306091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31197261810302734, + "epoch": 4.14, + "learning_rate": 2.0396087429054464e-05, + "loss": 0.4704, + "step": 4903, + "task_loss": 0.4006117880344391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2562114894390106, + "epoch": 4.15, + "learning_rate": 2.0390049510928634e-05, + "loss": 0.3859, + "step": 4904, + "task_loss": 0.11043187230825424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7524648308753967, + "epoch": 4.15, + "learning_rate": 2.03840115928028e-05, + "loss": 0.5424, + "step": 4905, + "task_loss": 0.6407849192619324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4935816526412964, + "epoch": 4.15, + "learning_rate": 2.0377973674676972e-05, + "loss": 0.6947, + "step": 4906, + "task_loss": 0.7342183589935303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4817306399345398, + "epoch": 4.15, + "learning_rate": 2.0371935756551143e-05, + "loss": 0.4713, + "step": 4907, + "task_loss": 0.3491319715976715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5387006402015686, + "epoch": 4.15, + "learning_rate": 2.0365897838425313e-05, + "loss": 0.6569, + "step": 4908, + "task_loss": 0.4180913269519806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5934556126594543, + "epoch": 4.15, + "learning_rate": 2.035985992029948e-05, + "loss": 0.6321, + "step": 4909, + "task_loss": 0.6348301768302917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40936630964279175, + "epoch": 4.15, + "learning_rate": 2.035382200217365e-05, + "loss": 0.5107, + "step": 4910, + "task_loss": 0.5561091899871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6141155958175659, + "epoch": 4.15, + "learning_rate": 2.034778408404782e-05, + "loss": 0.5211, + "step": 4911, + "task_loss": 1.0218429565429688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4583278298377991, + "epoch": 4.15, + "learning_rate": 2.0341746165921992e-05, + "loss": 0.5145, + "step": 4912, + "task_loss": 0.7295549511909485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6116304397583008, + "epoch": 4.15, + "learning_rate": 2.0335708247796163e-05, + "loss": 0.5252, + "step": 4913, + "task_loss": 0.6079584956169128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7159029245376587, + "epoch": 4.15, + "learning_rate": 2.032967032967033e-05, + "loss": 0.5575, + "step": 4914, + "task_loss": 1.1247341632843018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31829237937927246, + "epoch": 4.15, + "learning_rate": 2.03236324115445e-05, + "loss": 0.419, + "step": 4915, + "task_loss": 0.061529599130153656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4920375943183899, + "epoch": 4.16, + "learning_rate": 2.031759449341867e-05, + "loss": 0.6514, + "step": 4916, + "task_loss": 0.36307623982429504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4403213858604431, + "epoch": 4.16, + "learning_rate": 2.0311556575292838e-05, + "loss": 0.5035, + "step": 4917, + "task_loss": 0.809432327747345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5061545372009277, + "epoch": 4.16, + "learning_rate": 2.0305518657167012e-05, + "loss": 0.5346, + "step": 4918, + "task_loss": 1.1466282606124878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37074413895606995, + "epoch": 4.16, + "learning_rate": 2.029948073904118e-05, + "loss": 0.522, + "step": 4919, + "task_loss": 0.4455549418926239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2901389002799988, + "epoch": 4.16, + "learning_rate": 2.029344282091535e-05, + "loss": 0.4272, + "step": 4920, + "task_loss": 0.6190131306648254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5332611799240112, + "epoch": 4.16, + "learning_rate": 2.028740490278952e-05, + "loss": 0.4635, + "step": 4921, + "task_loss": 0.5690762996673584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48544007539749146, + "epoch": 4.16, + "learning_rate": 2.0281366984663688e-05, + "loss": 0.5802, + "step": 4922, + "task_loss": 0.582472026348114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5117124319076538, + "epoch": 4.16, + "learning_rate": 2.0275329066537858e-05, + "loss": 0.5468, + "step": 4923, + "task_loss": 0.5030387043952942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5606534481048584, + "epoch": 4.16, + "learning_rate": 2.026929114841203e-05, + "loss": 0.4677, + "step": 4924, + "task_loss": 0.7098164558410645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25867798924446106, + "epoch": 4.16, + "learning_rate": 2.0263253230286196e-05, + "loss": 0.3534, + "step": 4925, + "task_loss": 0.11649671941995621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3853670358657837, + "epoch": 4.16, + "learning_rate": 2.025721531216037e-05, + "loss": 0.3969, + "step": 4926, + "task_loss": 0.48405909538269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34250783920288086, + "epoch": 4.16, + "learning_rate": 2.0251177394034537e-05, + "loss": 0.5536, + "step": 4927, + "task_loss": 0.49599212408065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4231536388397217, + "epoch": 4.17, + "learning_rate": 2.0245139475908708e-05, + "loss": 0.4616, + "step": 4928, + "task_loss": 0.1916409134864807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49538522958755493, + "epoch": 4.17, + "learning_rate": 2.0239101557782878e-05, + "loss": 0.5214, + "step": 4929, + "task_loss": 0.7686262726783752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5811537504196167, + "epoch": 4.17, + "learning_rate": 2.0233063639657045e-05, + "loss": 0.5252, + "step": 4930, + "task_loss": 0.23819835484027863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42883989214897156, + "epoch": 4.17, + "learning_rate": 2.0227025721531216e-05, + "loss": 0.595, + "step": 4931, + "task_loss": 1.3238552808761597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6786895990371704, + "epoch": 4.17, + "learning_rate": 2.0220987803405387e-05, + "loss": 0.667, + "step": 4932, + "task_loss": 1.0247772932052612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2992522418498993, + "epoch": 4.17, + "learning_rate": 2.0214949885279557e-05, + "loss": 0.3871, + "step": 4933, + "task_loss": 0.5979833006858826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25604087114334106, + "epoch": 4.17, + "learning_rate": 2.0208911967153728e-05, + "loss": 0.6609, + "step": 4934, + "task_loss": 0.09765611588954926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6350706219673157, + "epoch": 4.17, + "learning_rate": 2.0202874049027895e-05, + "loss": 0.5367, + "step": 4935, + "task_loss": 0.7732728719711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17263920605182648, + "epoch": 4.17, + "learning_rate": 2.0196836130902065e-05, + "loss": 0.4161, + "step": 4936, + "task_loss": 0.03403476998209953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38287174701690674, + "epoch": 4.17, + "learning_rate": 2.0190798212776236e-05, + "loss": 0.4248, + "step": 4937, + "task_loss": 0.8116239309310913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48531174659729004, + "epoch": 4.17, + "learning_rate": 2.0184760294650407e-05, + "loss": 0.5875, + "step": 4938, + "task_loss": 0.8728266954421997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2720785140991211, + "epoch": 4.17, + "learning_rate": 2.0178722376524574e-05, + "loss": 0.6214, + "step": 4939, + "task_loss": 0.6443714499473572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5548807978630066, + "epoch": 4.18, + "learning_rate": 2.0172684458398744e-05, + "loss": 0.5624, + "step": 4940, + "task_loss": 1.4689717292785645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4050292670726776, + "epoch": 4.18, + "learning_rate": 2.0166646540272915e-05, + "loss": 0.3926, + "step": 4941, + "task_loss": 0.4794938862323761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33319228887557983, + "epoch": 4.18, + "learning_rate": 2.0160608622147085e-05, + "loss": 0.4167, + "step": 4942, + "task_loss": 0.8639678359031677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5307009816169739, + "epoch": 4.18, + "learning_rate": 2.0154570704021256e-05, + "loss": 0.6053, + "step": 4943, + "task_loss": 0.6365676522254944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34072375297546387, + "epoch": 4.18, + "learning_rate": 2.0148532785895423e-05, + "loss": 0.4851, + "step": 4944, + "task_loss": 0.43860340118408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6105844974517822, + "epoch": 4.18, + "learning_rate": 2.0142494867769594e-05, + "loss": 0.4286, + "step": 4945, + "task_loss": 0.8512235879898071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3144579529762268, + "epoch": 4.18, + "learning_rate": 2.0136456949643764e-05, + "loss": 0.5254, + "step": 4946, + "task_loss": 0.11859353631734848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4497188925743103, + "epoch": 4.18, + "learning_rate": 2.013041903151793e-05, + "loss": 0.4552, + "step": 4947, + "task_loss": 0.5314205884933472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42237424850463867, + "epoch": 4.18, + "learning_rate": 2.0124381113392105e-05, + "loss": 0.4107, + "step": 4948, + "task_loss": 0.5615251064300537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4527356028556824, + "epoch": 4.18, + "learning_rate": 2.0118343195266273e-05, + "loss": 0.4336, + "step": 4949, + "task_loss": 1.429122805595398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.677120566368103, + "epoch": 4.18, + "learning_rate": 2.0112305277140443e-05, + "loss": 0.5756, + "step": 4950, + "task_loss": 0.7396681308746338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2748467028141022, + "epoch": 4.19, + "learning_rate": 2.0106267359014614e-05, + "loss": 0.4943, + "step": 4951, + "task_loss": 0.05191145837306976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38200289011001587, + "epoch": 4.19, + "learning_rate": 2.010022944088878e-05, + "loss": 0.4552, + "step": 4952, + "task_loss": 0.6278354525566101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4642336666584015, + "epoch": 4.19, + "learning_rate": 2.0094191522762955e-05, + "loss": 0.4749, + "step": 4953, + "task_loss": 1.0173221826553345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46911531686782837, + "epoch": 4.19, + "learning_rate": 2.0088153604637122e-05, + "loss": 0.5151, + "step": 4954, + "task_loss": 0.9429162740707397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6079760789871216, + "epoch": 4.19, + "learning_rate": 2.008211568651129e-05, + "loss": 0.6305, + "step": 4955, + "task_loss": 0.2932954430580139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0613927841186523, + "epoch": 4.19, + "learning_rate": 2.0076077768385463e-05, + "loss": 0.6236, + "step": 4956, + "task_loss": 0.8749066591262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3994763195514679, + "epoch": 4.19, + "learning_rate": 2.007003985025963e-05, + "loss": 0.4237, + "step": 4957, + "task_loss": 0.1639009565114975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4584995210170746, + "epoch": 4.19, + "learning_rate": 2.00640019321338e-05, + "loss": 0.5563, + "step": 4958, + "task_loss": 0.36930835247039795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38101595640182495, + "epoch": 4.19, + "learning_rate": 2.005796401400797e-05, + "loss": 0.4647, + "step": 4959, + "task_loss": 0.29052144289016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.581211507320404, + "epoch": 4.19, + "learning_rate": 2.005192609588214e-05, + "loss": 0.3978, + "step": 4960, + "task_loss": 0.3513124883174896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25656163692474365, + "epoch": 4.19, + "learning_rate": 2.0045888177756313e-05, + "loss": 0.4314, + "step": 4961, + "task_loss": 1.7514078617095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7383733987808228, + "epoch": 4.19, + "learning_rate": 2.003985025963048e-05, + "loss": 0.5809, + "step": 4962, + "task_loss": 0.9347806572914124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42711612582206726, + "epoch": 4.2, + "learning_rate": 2.003381234150465e-05, + "loss": 0.6147, + "step": 4963, + "task_loss": 0.43052881956100464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34706443548202515, + "epoch": 4.2, + "learning_rate": 2.002777442337882e-05, + "loss": 0.5872, + "step": 4964, + "task_loss": 0.6177386045455933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39574432373046875, + "epoch": 4.2, + "learning_rate": 2.0021736505252988e-05, + "loss": 0.63, + "step": 4965, + "task_loss": 0.3420225977897644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43850213289260864, + "epoch": 4.2, + "learning_rate": 2.001569858712716e-05, + "loss": 0.5818, + "step": 4966, + "task_loss": 0.9004715085029602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5186364650726318, + "epoch": 4.2, + "learning_rate": 2.000966066900133e-05, + "loss": 0.6236, + "step": 4967, + "task_loss": 0.9594752788543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24794182181358337, + "epoch": 4.2, + "learning_rate": 2.00036227508755e-05, + "loss": 0.4956, + "step": 4968, + "task_loss": 0.5048964023590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2958824634552002, + "epoch": 4.2, + "learning_rate": 1.999758483274967e-05, + "loss": 0.6031, + "step": 4969, + "task_loss": 0.43613654375076294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9088079333305359, + "epoch": 4.2, + "learning_rate": 1.9991546914623838e-05, + "loss": 0.7615, + "step": 4970, + "task_loss": 1.6905773878097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4514431357383728, + "epoch": 4.2, + "learning_rate": 1.9985508996498008e-05, + "loss": 0.3774, + "step": 4971, + "task_loss": 0.4469534754753113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4144880175590515, + "epoch": 4.2, + "learning_rate": 1.997947107837218e-05, + "loss": 0.6766, + "step": 4972, + "task_loss": 0.7785089015960693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7056358456611633, + "epoch": 4.2, + "learning_rate": 1.997343316024635e-05, + "loss": 0.6371, + "step": 4973, + "task_loss": 0.7132214903831482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5285404920578003, + "epoch": 4.2, + "learning_rate": 1.9967395242120517e-05, + "loss": 0.4511, + "step": 4974, + "task_loss": 1.0623445510864258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3632991313934326, + "epoch": 4.21, + "learning_rate": 1.9961357323994687e-05, + "loss": 0.5419, + "step": 4975, + "task_loss": 0.5024172067642212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4428093135356903, + "epoch": 4.21, + "learning_rate": 1.9955319405868858e-05, + "loss": 0.5169, + "step": 4976, + "task_loss": 0.7164888381958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6598308682441711, + "epoch": 4.21, + "learning_rate": 1.994928148774303e-05, + "loss": 0.5541, + "step": 4977, + "task_loss": 1.258410930633545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7521320581436157, + "epoch": 4.21, + "learning_rate": 1.99432435696172e-05, + "loss": 0.5877, + "step": 4978, + "task_loss": 0.3530876934528351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6055967807769775, + "epoch": 4.21, + "learning_rate": 1.9937205651491366e-05, + "loss": 0.469, + "step": 4979, + "task_loss": 0.5825212597846985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37297290563583374, + "epoch": 4.21, + "learning_rate": 1.9931167733365537e-05, + "loss": 0.5484, + "step": 4980, + "task_loss": 0.5703855752944946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4025610685348511, + "epoch": 4.21, + "learning_rate": 1.9925129815239707e-05, + "loss": 0.7857, + "step": 4981, + "task_loss": 0.4340205490589142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.371643602848053, + "epoch": 4.21, + "learning_rate": 1.9919091897113874e-05, + "loss": 0.3604, + "step": 4982, + "task_loss": 0.9145194888114929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5458828210830688, + "epoch": 4.21, + "learning_rate": 1.991305397898805e-05, + "loss": 0.5892, + "step": 4983, + "task_loss": 0.7225217819213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8708949089050293, + "epoch": 4.21, + "learning_rate": 1.9907016060862216e-05, + "loss": 0.6921, + "step": 4984, + "task_loss": 1.502190351486206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47971537709236145, + "epoch": 4.21, + "learning_rate": 1.9900978142736386e-05, + "loss": 0.423, + "step": 4985, + "task_loss": 0.708484411239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44678863883018494, + "epoch": 4.21, + "learning_rate": 1.9894940224610557e-05, + "loss": 0.6514, + "step": 4986, + "task_loss": 0.6540032625198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3461977243423462, + "epoch": 4.22, + "learning_rate": 1.9888902306484724e-05, + "loss": 0.4556, + "step": 4987, + "task_loss": 1.1603264808654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7210103273391724, + "epoch": 4.22, + "learning_rate": 1.9882864388358894e-05, + "loss": 0.5274, + "step": 4988, + "task_loss": 1.5134752988815308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33187419176101685, + "epoch": 4.22, + "learning_rate": 1.9876826470233065e-05, + "loss": 0.5035, + "step": 4989, + "task_loss": 0.7476351857185364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.452039897441864, + "epoch": 4.22, + "learning_rate": 1.9870788552107232e-05, + "loss": 0.5199, + "step": 4990, + "task_loss": 1.0907161235809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24417778849601746, + "epoch": 4.22, + "learning_rate": 1.9864750633981406e-05, + "loss": 0.4159, + "step": 4991, + "task_loss": 0.2452525645494461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42879533767700195, + "epoch": 4.22, + "learning_rate": 1.9858712715855573e-05, + "loss": 0.5038, + "step": 4992, + "task_loss": 0.6371445059776306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3104240298271179, + "epoch": 4.22, + "learning_rate": 1.9852674797729744e-05, + "loss": 0.4537, + "step": 4993, + "task_loss": 0.34255972504615784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5890994071960449, + "epoch": 4.22, + "learning_rate": 1.9846636879603914e-05, + "loss": 0.4542, + "step": 4994, + "task_loss": 1.377626895904541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5061273574829102, + "epoch": 4.22, + "learning_rate": 1.984059896147808e-05, + "loss": 0.4727, + "step": 4995, + "task_loss": 0.7576563954353333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33526724576950073, + "epoch": 4.22, + "learning_rate": 1.9834561043352252e-05, + "loss": 0.5027, + "step": 4996, + "task_loss": 0.5246338844299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5494797825813293, + "epoch": 4.22, + "learning_rate": 1.9828523125226423e-05, + "loss": 0.6421, + "step": 4997, + "task_loss": 0.9772250652313232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5553446412086487, + "epoch": 4.22, + "learning_rate": 1.9822485207100593e-05, + "loss": 0.5829, + "step": 4998, + "task_loss": 0.22669434547424316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4884990453720093, + "epoch": 4.23, + "learning_rate": 1.9816447288974764e-05, + "loss": 0.4299, + "step": 4999, + "task_loss": 0.7669544219970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47162771224975586, + "epoch": 4.23, + "learning_rate": 1.981040937084893e-05, + "loss": 0.5348, + "step": 5000, + "task_loss": 0.9524763226509094 + }, + { + "epoch": 4.23, + "eval_accuracy": 0.9006336633663367, + "eval_loss": 0.3424574136734009, + "eval_runtime": 227.1205, + "eval_samples_per_second": 111.174, + "eval_steps_per_second": 0.872, + "step": 5000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.532374382019043, + "epoch": 4.23, + "learning_rate": 1.9804371452723102e-05, + "loss": 0.5458, + "step": 5001, + "task_loss": 0.7421075105667114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3753403425216675, + "epoch": 4.23, + "learning_rate": 1.9798333534597272e-05, + "loss": 0.6103, + "step": 5002, + "task_loss": 0.24701862037181854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2989429831504822, + "epoch": 4.23, + "learning_rate": 1.9792295616471443e-05, + "loss": 0.4956, + "step": 5003, + "task_loss": 0.4482909142971039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.622974693775177, + "epoch": 4.23, + "learning_rate": 1.978625769834561e-05, + "loss": 0.5782, + "step": 5004, + "task_loss": 0.6856787800788879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.312394380569458, + "epoch": 4.23, + "learning_rate": 1.978021978021978e-05, + "loss": 0.3891, + "step": 5005, + "task_loss": 0.27638155221939087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5630203485488892, + "epoch": 4.23, + "learning_rate": 1.977418186209395e-05, + "loss": 0.5441, + "step": 5006, + "task_loss": 0.21401074528694153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4234447777271271, + "epoch": 4.23, + "learning_rate": 1.9768143943968122e-05, + "loss": 0.3912, + "step": 5007, + "task_loss": 0.7603311538696289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3590165972709656, + "epoch": 4.23, + "learning_rate": 1.9762106025842292e-05, + "loss": 0.4712, + "step": 5008, + "task_loss": 0.07598962634801865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38335761427879333, + "epoch": 4.23, + "learning_rate": 1.975606810771646e-05, + "loss": 0.623, + "step": 5009, + "task_loss": 0.23930713534355164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5075116753578186, + "epoch": 4.23, + "learning_rate": 1.975003018959063e-05, + "loss": 0.5581, + "step": 5010, + "task_loss": 0.7713028192520142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4468742311000824, + "epoch": 4.24, + "learning_rate": 1.97439922714648e-05, + "loss": 0.426, + "step": 5011, + "task_loss": 0.6361373662948608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42873284220695496, + "epoch": 4.24, + "learning_rate": 1.9737954353338968e-05, + "loss": 0.4742, + "step": 5012, + "task_loss": 0.8471259474754333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3435823917388916, + "epoch": 4.24, + "learning_rate": 1.9731916435213142e-05, + "loss": 0.4364, + "step": 5013, + "task_loss": 0.5504423379898071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47085511684417725, + "epoch": 4.24, + "learning_rate": 1.972587851708731e-05, + "loss": 0.5782, + "step": 5014, + "task_loss": 0.5790916681289673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37175899744033813, + "epoch": 4.24, + "learning_rate": 1.971984059896148e-05, + "loss": 0.4701, + "step": 5015, + "task_loss": 0.3965161442756653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3714584410190582, + "epoch": 4.24, + "learning_rate": 1.971380268083565e-05, + "loss": 0.5646, + "step": 5016, + "task_loss": 0.4959770143032074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3377198576927185, + "epoch": 4.24, + "learning_rate": 1.9707764762709817e-05, + "loss": 0.5466, + "step": 5017, + "task_loss": 1.0765025615692139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3954613208770752, + "epoch": 4.24, + "learning_rate": 1.970172684458399e-05, + "loss": 0.5697, + "step": 5018, + "task_loss": 0.3766115605831146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6157458424568176, + "epoch": 4.24, + "learning_rate": 1.969568892645816e-05, + "loss": 0.5446, + "step": 5019, + "task_loss": 1.2350085973739624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5992120504379272, + "epoch": 4.24, + "learning_rate": 1.9689651008332326e-05, + "loss": 0.412, + "step": 5020, + "task_loss": 0.3953227698802948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32933223247528076, + "epoch": 4.24, + "learning_rate": 1.96836130902065e-05, + "loss": 0.4416, + "step": 5021, + "task_loss": 0.45828381180763245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.405678391456604, + "epoch": 4.24, + "learning_rate": 1.9677575172080667e-05, + "loss": 0.5131, + "step": 5022, + "task_loss": 1.3066145181655884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45870906114578247, + "epoch": 4.25, + "learning_rate": 1.9671537253954837e-05, + "loss": 0.343, + "step": 5023, + "task_loss": 0.49534887075424194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36197078227996826, + "epoch": 4.25, + "learning_rate": 1.9665499335829008e-05, + "loss": 0.5431, + "step": 5024, + "task_loss": 0.37182527780532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5389375686645508, + "epoch": 4.25, + "learning_rate": 1.9659461417703175e-05, + "loss": 0.5633, + "step": 5025, + "task_loss": 0.6920986175537109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6336544752120972, + "epoch": 4.25, + "learning_rate": 1.965342349957735e-05, + "loss": 0.6008, + "step": 5026, + "task_loss": 0.3556116819381714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5226563215255737, + "epoch": 4.25, + "learning_rate": 1.9647385581451516e-05, + "loss": 0.577, + "step": 5027, + "task_loss": 0.35030683875083923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5266939401626587, + "epoch": 4.25, + "learning_rate": 1.9641347663325683e-05, + "loss": 0.466, + "step": 5028, + "task_loss": 0.7201595306396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.635701060295105, + "epoch": 4.25, + "learning_rate": 1.9635309745199857e-05, + "loss": 0.5099, + "step": 5029, + "task_loss": 1.4238333702087402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42528796195983887, + "epoch": 4.25, + "learning_rate": 1.9629271827074025e-05, + "loss": 0.4293, + "step": 5030, + "task_loss": 0.8699221611022949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5875149965286255, + "epoch": 4.25, + "learning_rate": 1.9623233908948195e-05, + "loss": 0.6672, + "step": 5031, + "task_loss": 0.960726261138916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3001979887485504, + "epoch": 4.25, + "learning_rate": 1.9617195990822366e-05, + "loss": 0.4146, + "step": 5032, + "task_loss": 0.08719417452812195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4166693389415741, + "epoch": 4.25, + "learning_rate": 1.9611158072696533e-05, + "loss": 0.4172, + "step": 5033, + "task_loss": 0.7476599216461182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6274157762527466, + "epoch": 4.26, + "learning_rate": 1.9605120154570707e-05, + "loss": 0.501, + "step": 5034, + "task_loss": 0.7170937657356262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22525671124458313, + "epoch": 4.26, + "learning_rate": 1.9599082236444874e-05, + "loss": 0.5107, + "step": 5035, + "task_loss": 0.3120877742767334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4647220969200134, + "epoch": 4.26, + "learning_rate": 1.9593044318319045e-05, + "loss": 0.4565, + "step": 5036, + "task_loss": 0.9033588171005249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.2947896718978882, + "epoch": 4.26, + "learning_rate": 1.9587006400193215e-05, + "loss": 0.8977, + "step": 5037, + "task_loss": 0.5642430782318115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6388611793518066, + "epoch": 4.26, + "learning_rate": 1.9580968482067382e-05, + "loss": 0.602, + "step": 5038, + "task_loss": 0.47370365262031555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5020397305488586, + "epoch": 4.26, + "learning_rate": 1.9574930563941553e-05, + "loss": 0.6733, + "step": 5039, + "task_loss": 0.6022677421569824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4384838044643402, + "epoch": 4.26, + "learning_rate": 1.9568892645815723e-05, + "loss": 0.4635, + "step": 5040, + "task_loss": 0.3759649693965912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5018907189369202, + "epoch": 4.26, + "learning_rate": 1.9562854727689894e-05, + "loss": 0.3896, + "step": 5041, + "task_loss": 0.36943864822387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38962697982788086, + "epoch": 4.26, + "learning_rate": 1.9556816809564065e-05, + "loss": 0.4285, + "step": 5042, + "task_loss": 0.556939959526062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2982959449291229, + "epoch": 4.26, + "learning_rate": 1.9550778891438232e-05, + "loss": 0.4348, + "step": 5043, + "task_loss": 0.3856349587440491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8032476305961609, + "epoch": 4.26, + "learning_rate": 1.9544740973312402e-05, + "loss": 0.5482, + "step": 5044, + "task_loss": 0.25123676657676697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4885512888431549, + "epoch": 4.26, + "learning_rate": 1.9538703055186573e-05, + "loss": 0.4219, + "step": 5045, + "task_loss": 0.24022117257118225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4221011698246002, + "epoch": 4.27, + "learning_rate": 1.9532665137060744e-05, + "loss": 0.3614, + "step": 5046, + "task_loss": 0.7391841411590576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4160241186618805, + "epoch": 4.27, + "learning_rate": 1.952662721893491e-05, + "loss": 0.5984, + "step": 5047, + "task_loss": 0.40984421968460083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5365934371948242, + "epoch": 4.27, + "learning_rate": 1.952058930080908e-05, + "loss": 0.5665, + "step": 5048, + "task_loss": 0.5471963882446289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7437394261360168, + "epoch": 4.27, + "learning_rate": 1.9514551382683252e-05, + "loss": 0.5564, + "step": 5049, + "task_loss": 0.6211931109428406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42773520946502686, + "epoch": 4.27, + "learning_rate": 1.9508513464557422e-05, + "loss": 0.3929, + "step": 5050, + "task_loss": 0.8035658597946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39322546124458313, + "epoch": 4.27, + "learning_rate": 1.9502475546431593e-05, + "loss": 0.6138, + "step": 5051, + "task_loss": 0.703641414642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5219629406929016, + "epoch": 4.27, + "learning_rate": 1.949643762830576e-05, + "loss": 0.6015, + "step": 5052, + "task_loss": 0.25942450761795044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6487494707107544, + "epoch": 4.27, + "learning_rate": 1.949039971017993e-05, + "loss": 0.6842, + "step": 5053, + "task_loss": 0.9145632386207581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7413905262947083, + "epoch": 4.27, + "learning_rate": 1.94843617920541e-05, + "loss": 0.5048, + "step": 5054, + "task_loss": 1.3290339708328247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7212687134742737, + "epoch": 4.27, + "learning_rate": 1.947832387392827e-05, + "loss": 0.6062, + "step": 5055, + "task_loss": 0.7027802467346191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8545883893966675, + "epoch": 4.27, + "learning_rate": 1.9472285955802442e-05, + "loss": 0.5267, + "step": 5056, + "task_loss": 1.0202313661575317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0464457273483276, + "epoch": 4.27, + "learning_rate": 1.946624803767661e-05, + "loss": 0.6722, + "step": 5057, + "task_loss": 1.4713572263717651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4243375360965729, + "epoch": 4.28, + "learning_rate": 1.946021011955078e-05, + "loss": 0.4315, + "step": 5058, + "task_loss": 0.29880428314208984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6840518116950989, + "epoch": 4.28, + "learning_rate": 1.945417220142495e-05, + "loss": 0.7085, + "step": 5059, + "task_loss": 0.4467519223690033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6907805800437927, + "epoch": 4.28, + "learning_rate": 1.9448134283299118e-05, + "loss": 0.5925, + "step": 5060, + "task_loss": 0.8569194674491882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3326148986816406, + "epoch": 4.28, + "learning_rate": 1.944209636517329e-05, + "loss": 0.5376, + "step": 5061, + "task_loss": 0.3716012239456177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3155145049095154, + "epoch": 4.28, + "learning_rate": 1.943605844704746e-05, + "loss": 0.5509, + "step": 5062, + "task_loss": 0.9305038452148438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44722580909729004, + "epoch": 4.28, + "learning_rate": 1.9430020528921626e-05, + "loss": 0.8844, + "step": 5063, + "task_loss": 0.3477742373943329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6165612936019897, + "epoch": 4.28, + "learning_rate": 1.94239826107958e-05, + "loss": 0.5751, + "step": 5064, + "task_loss": 0.6828938126564026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5767428278923035, + "epoch": 4.28, + "learning_rate": 1.9417944692669967e-05, + "loss": 0.516, + "step": 5065, + "task_loss": 0.46416810154914856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4867111146450043, + "epoch": 4.28, + "learning_rate": 1.9411906774544138e-05, + "loss": 0.5893, + "step": 5066, + "task_loss": 0.9872850179672241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4123658835887909, + "epoch": 4.28, + "learning_rate": 1.940586885641831e-05, + "loss": 0.4523, + "step": 5067, + "task_loss": 0.30360570549964905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5846825838088989, + "epoch": 4.28, + "learning_rate": 1.9399830938292476e-05, + "loss": 0.5028, + "step": 5068, + "task_loss": 1.1648516654968262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6611562967300415, + "epoch": 4.28, + "learning_rate": 1.9393793020166646e-05, + "loss": 0.5424, + "step": 5069, + "task_loss": 0.8596183061599731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7998292446136475, + "epoch": 4.29, + "learning_rate": 1.9387755102040817e-05, + "loss": 0.6471, + "step": 5070, + "task_loss": 1.4393681287765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35797637701034546, + "epoch": 4.29, + "learning_rate": 1.9381717183914987e-05, + "loss": 0.3737, + "step": 5071, + "task_loss": 0.06246805191040039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.530833899974823, + "epoch": 4.29, + "learning_rate": 1.9375679265789158e-05, + "loss": 0.5295, + "step": 5072, + "task_loss": 0.5695270895957947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.736621081829071, + "epoch": 4.29, + "learning_rate": 1.9369641347663325e-05, + "loss": 0.8477, + "step": 5073, + "task_loss": 0.6966522932052612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7256484031677246, + "epoch": 4.29, + "learning_rate": 1.9363603429537496e-05, + "loss": 0.5734, + "step": 5074, + "task_loss": 0.4963006377220154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6662230491638184, + "epoch": 4.29, + "learning_rate": 1.9357565511411666e-05, + "loss": 0.5937, + "step": 5075, + "task_loss": 0.9909964203834534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.364436537027359, + "epoch": 4.29, + "learning_rate": 1.9351527593285837e-05, + "loss": 0.4042, + "step": 5076, + "task_loss": 0.25013086199760437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.541681170463562, + "epoch": 4.29, + "learning_rate": 1.9345489675160004e-05, + "loss": 0.5675, + "step": 5077, + "task_loss": 0.5219549536705017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29979416728019714, + "epoch": 4.29, + "learning_rate": 1.9339451757034175e-05, + "loss": 0.3981, + "step": 5078, + "task_loss": 0.6289870142936707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7802372574806213, + "epoch": 4.29, + "learning_rate": 1.9333413838908345e-05, + "loss": 0.6249, + "step": 5079, + "task_loss": 0.9983679056167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3775298297405243, + "epoch": 4.29, + "learning_rate": 1.9327375920782516e-05, + "loss": 0.4229, + "step": 5080, + "task_loss": 0.46403446793556213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4789639711380005, + "epoch": 4.29, + "learning_rate": 1.9321338002656686e-05, + "loss": 0.4558, + "step": 5081, + "task_loss": 0.7437804341316223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3236818015575409, + "epoch": 4.3, + "learning_rate": 1.9315300084530854e-05, + "loss": 0.4585, + "step": 5082, + "task_loss": 0.43122851848602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.415682315826416, + "epoch": 4.3, + "learning_rate": 1.9309262166405024e-05, + "loss": 0.6093, + "step": 5083, + "task_loss": 0.6977458596229553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47659558057785034, + "epoch": 4.3, + "learning_rate": 1.9303224248279195e-05, + "loss": 0.5115, + "step": 5084, + "task_loss": 0.8346410393714905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.14233285188674927, + "epoch": 4.3, + "learning_rate": 1.9297186330153362e-05, + "loss": 0.345, + "step": 5085, + "task_loss": 0.01219052542001009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7001383304595947, + "epoch": 4.3, + "learning_rate": 1.9291148412027536e-05, + "loss": 0.5325, + "step": 5086, + "task_loss": 0.373574435710907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7093246579170227, + "epoch": 4.3, + "learning_rate": 1.9285110493901703e-05, + "loss": 0.6403, + "step": 5087, + "task_loss": 1.1015516519546509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30475446581840515, + "epoch": 4.3, + "learning_rate": 1.9279072575775874e-05, + "loss": 0.4312, + "step": 5088, + "task_loss": 0.6931011080741882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5989788174629211, + "epoch": 4.3, + "learning_rate": 1.9273034657650044e-05, + "loss": 0.5142, + "step": 5089, + "task_loss": 0.5983972549438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4142167568206787, + "epoch": 4.3, + "learning_rate": 1.926699673952421e-05, + "loss": 0.4006, + "step": 5090, + "task_loss": 0.6432098150253296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45670264959335327, + "epoch": 4.3, + "learning_rate": 1.9260958821398385e-05, + "loss": 0.4902, + "step": 5091, + "task_loss": 0.664253830909729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37471523880958557, + "epoch": 4.3, + "learning_rate": 1.9254920903272553e-05, + "loss": 0.5366, + "step": 5092, + "task_loss": 0.2518182098865509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5968806743621826, + "epoch": 4.3, + "learning_rate": 1.924888298514672e-05, + "loss": 0.5366, + "step": 5093, + "task_loss": 1.1582841873168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5674070715904236, + "epoch": 4.31, + "learning_rate": 1.9242845067020894e-05, + "loss": 0.4621, + "step": 5094, + "task_loss": 0.767611026763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47268348932266235, + "epoch": 4.31, + "learning_rate": 1.923680714889506e-05, + "loss": 0.4878, + "step": 5095, + "task_loss": 0.47463980317115784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35396715998649597, + "epoch": 4.31, + "learning_rate": 1.923076923076923e-05, + "loss": 0.4539, + "step": 5096, + "task_loss": 0.9898560047149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40556108951568604, + "epoch": 4.31, + "learning_rate": 1.9224731312643402e-05, + "loss": 0.3564, + "step": 5097, + "task_loss": 0.06661086529493332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5862468481063843, + "epoch": 4.31, + "learning_rate": 1.921869339451757e-05, + "loss": 0.689, + "step": 5098, + "task_loss": 0.9265633821487427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4073794484138489, + "epoch": 4.31, + "learning_rate": 1.9212655476391743e-05, + "loss": 0.6769, + "step": 5099, + "task_loss": 0.9757845401763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3788613975048065, + "epoch": 4.31, + "learning_rate": 1.920661755826591e-05, + "loss": 0.5227, + "step": 5100, + "task_loss": 1.009295105934143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5585248470306396, + "epoch": 4.31, + "learning_rate": 1.920057964014008e-05, + "loss": 0.597, + "step": 5101, + "task_loss": 1.3039665222167969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4124269485473633, + "epoch": 4.31, + "learning_rate": 1.919454172201425e-05, + "loss": 0.4038, + "step": 5102, + "task_loss": 0.18576285243034363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5707939863204956, + "epoch": 4.31, + "learning_rate": 1.918850380388842e-05, + "loss": 0.6589, + "step": 5103, + "task_loss": 1.527671217918396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3180033564567566, + "epoch": 4.31, + "learning_rate": 1.918246588576259e-05, + "loss": 0.5257, + "step": 5104, + "task_loss": 0.6055582165718079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6053884625434875, + "epoch": 4.32, + "learning_rate": 1.917642796763676e-05, + "loss": 0.544, + "step": 5105, + "task_loss": 0.9133829474449158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5316344499588013, + "epoch": 4.32, + "learning_rate": 1.917039004951093e-05, + "loss": 0.4938, + "step": 5106, + "task_loss": 1.6233495473861694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5007742047309875, + "epoch": 4.32, + "learning_rate": 1.91643521313851e-05, + "loss": 0.4212, + "step": 5107, + "task_loss": 0.6284265518188477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2910877764225006, + "epoch": 4.32, + "learning_rate": 1.9158314213259268e-05, + "loss": 0.3779, + "step": 5108, + "task_loss": 0.7947986125946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8732811212539673, + "epoch": 4.32, + "learning_rate": 1.915227629513344e-05, + "loss": 0.5189, + "step": 5109, + "task_loss": 0.6603541374206543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.00429105758667, + "epoch": 4.32, + "learning_rate": 1.914623837700761e-05, + "loss": 0.5992, + "step": 5110, + "task_loss": 0.8881577849388123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6181060075759888, + "epoch": 4.32, + "learning_rate": 1.914020045888178e-05, + "loss": 0.497, + "step": 5111, + "task_loss": 1.317571997642517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6937109231948853, + "epoch": 4.32, + "learning_rate": 1.9134162540755947e-05, + "loss": 0.489, + "step": 5112, + "task_loss": 0.6713109016418457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7358061671257019, + "epoch": 4.32, + "learning_rate": 1.9128124622630118e-05, + "loss": 0.5367, + "step": 5113, + "task_loss": 0.7201684713363647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6936349868774414, + "epoch": 4.32, + "learning_rate": 1.9122086704504288e-05, + "loss": 0.5367, + "step": 5114, + "task_loss": 0.6637175679206848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6068978309631348, + "epoch": 4.32, + "learning_rate": 1.911604878637846e-05, + "loss": 0.5031, + "step": 5115, + "task_loss": 0.26968470215797424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40736767649650574, + "epoch": 4.32, + "learning_rate": 1.911001086825263e-05, + "loss": 0.5261, + "step": 5116, + "task_loss": 0.5758113861083984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4408773183822632, + "epoch": 4.33, + "learning_rate": 1.9103972950126796e-05, + "loss": 0.4445, + "step": 5117, + "task_loss": 0.4235752820968628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44996312260627747, + "epoch": 4.33, + "learning_rate": 1.9097935032000967e-05, + "loss": 0.4421, + "step": 5118, + "task_loss": 0.4323626756668091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6320191025733948, + "epoch": 4.33, + "learning_rate": 1.9091897113875138e-05, + "loss": 0.6713, + "step": 5119, + "task_loss": 0.23968547582626343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6504097580909729, + "epoch": 4.33, + "learning_rate": 1.9085859195749305e-05, + "loss": 0.5741, + "step": 5120, + "task_loss": 0.6169661283493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4878043234348297, + "epoch": 4.33, + "learning_rate": 1.907982127762348e-05, + "loss": 0.4876, + "step": 5121, + "task_loss": 0.863811731338501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46960264444351196, + "epoch": 4.33, + "learning_rate": 1.9073783359497646e-05, + "loss": 0.568, + "step": 5122, + "task_loss": 1.1279407739639282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7212247252464294, + "epoch": 4.33, + "learning_rate": 1.9067745441371817e-05, + "loss": 0.5578, + "step": 5123, + "task_loss": 1.0002729892730713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5766572952270508, + "epoch": 4.33, + "learning_rate": 1.9061707523245987e-05, + "loss": 0.3965, + "step": 5124, + "task_loss": 0.5978819131851196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5018870830535889, + "epoch": 4.33, + "learning_rate": 1.9055669605120154e-05, + "loss": 0.4307, + "step": 5125, + "task_loss": 0.843283474445343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43089205026626587, + "epoch": 4.33, + "learning_rate": 1.9049631686994325e-05, + "loss": 0.4732, + "step": 5126, + "task_loss": 1.351424217224121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.525310754776001, + "epoch": 4.33, + "learning_rate": 1.9043593768868495e-05, + "loss": 0.5845, + "step": 5127, + "task_loss": 0.8180626034736633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5770017504692078, + "epoch": 4.33, + "learning_rate": 1.9037555850742663e-05, + "loss": 0.5818, + "step": 5128, + "task_loss": 1.3524112701416016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8728530406951904, + "epoch": 4.34, + "learning_rate": 1.9031517932616837e-05, + "loss": 0.5867, + "step": 5129, + "task_loss": 0.9379435181617737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30259817838668823, + "epoch": 4.34, + "learning_rate": 1.9025480014491004e-05, + "loss": 0.4369, + "step": 5130, + "task_loss": 0.5340809226036072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32282328605651855, + "epoch": 4.34, + "learning_rate": 1.9019442096365174e-05, + "loss": 0.4358, + "step": 5131, + "task_loss": 0.8354558944702148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5996333956718445, + "epoch": 4.34, + "learning_rate": 1.9013404178239345e-05, + "loss": 0.5105, + "step": 5132, + "task_loss": 1.0537067651748657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3505741357803345, + "epoch": 4.34, + "learning_rate": 1.9007366260113512e-05, + "loss": 0.4788, + "step": 5133, + "task_loss": 0.8713359832763672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4501539468765259, + "epoch": 4.34, + "learning_rate": 1.9001328341987683e-05, + "loss": 0.6639, + "step": 5134, + "task_loss": 0.597876787185669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44056135416030884, + "epoch": 4.34, + "learning_rate": 1.8995290423861853e-05, + "loss": 0.6057, + "step": 5135, + "task_loss": 0.3981756269931793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29598724842071533, + "epoch": 4.34, + "learning_rate": 1.8989252505736024e-05, + "loss": 0.3804, + "step": 5136, + "task_loss": 0.1258758306503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4103579521179199, + "epoch": 4.34, + "learning_rate": 1.8983214587610194e-05, + "loss": 0.4197, + "step": 5137, + "task_loss": 0.6493760943412781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.632029116153717, + "epoch": 4.34, + "learning_rate": 1.897717666948436e-05, + "loss": 0.5157, + "step": 5138, + "task_loss": 0.593542754650116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3973308205604553, + "epoch": 4.34, + "learning_rate": 1.8971138751358532e-05, + "loss": 0.5195, + "step": 5139, + "task_loss": 0.42211097478866577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2950882911682129, + "epoch": 4.34, + "learning_rate": 1.8965100833232703e-05, + "loss": 0.3578, + "step": 5140, + "task_loss": 0.1372232586145401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.676523745059967, + "epoch": 4.35, + "learning_rate": 1.8959062915106873e-05, + "loss": 0.6681, + "step": 5141, + "task_loss": 0.6642469763755798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5017623901367188, + "epoch": 4.35, + "learning_rate": 1.895302499698104e-05, + "loss": 0.413, + "step": 5142, + "task_loss": 0.8043433427810669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34978625178337097, + "epoch": 4.35, + "learning_rate": 1.894698707885521e-05, + "loss": 0.3961, + "step": 5143, + "task_loss": 0.552386462688446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2580888867378235, + "epoch": 4.35, + "learning_rate": 1.894094916072938e-05, + "loss": 0.4161, + "step": 5144, + "task_loss": 0.0676063820719719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.367387056350708, + "epoch": 4.35, + "learning_rate": 1.8934911242603552e-05, + "loss": 0.4663, + "step": 5145, + "task_loss": 0.8344635367393494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7152042984962463, + "epoch": 4.35, + "learning_rate": 1.8928873324477723e-05, + "loss": 0.5356, + "step": 5146, + "task_loss": 0.8455249667167664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3174892067909241, + "epoch": 4.35, + "learning_rate": 1.892283540635189e-05, + "loss": 0.4547, + "step": 5147, + "task_loss": 0.14665673673152924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3568580746650696, + "epoch": 4.35, + "learning_rate": 1.891679748822606e-05, + "loss": 0.5103, + "step": 5148, + "task_loss": 0.40141525864601135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5586023330688477, + "epoch": 4.35, + "learning_rate": 1.891075957010023e-05, + "loss": 0.6699, + "step": 5149, + "task_loss": 0.7082034349441528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7683988213539124, + "epoch": 4.35, + "learning_rate": 1.8904721651974398e-05, + "loss": 0.6165, + "step": 5150, + "task_loss": 0.7438834309577942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5254579782485962, + "epoch": 4.35, + "learning_rate": 1.8898683733848572e-05, + "loss": 0.4535, + "step": 5151, + "task_loss": 0.8049620389938354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.468134343624115, + "epoch": 4.35, + "learning_rate": 1.889264581572274e-05, + "loss": 0.4523, + "step": 5152, + "task_loss": 1.2789943218231201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6025068759918213, + "epoch": 4.36, + "learning_rate": 1.888660789759691e-05, + "loss": 0.605, + "step": 5153, + "task_loss": 1.0537078380584717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.373046875, + "epoch": 4.36, + "learning_rate": 1.888056997947108e-05, + "loss": 0.4383, + "step": 5154, + "task_loss": 1.196982979774475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7268447279930115, + "epoch": 4.36, + "learning_rate": 1.8874532061345248e-05, + "loss": 0.5752, + "step": 5155, + "task_loss": 0.4414595067501068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2773147225379944, + "epoch": 4.36, + "learning_rate": 1.886849414321942e-05, + "loss": 0.531, + "step": 5156, + "task_loss": 1.1413606405258179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9555575847625732, + "epoch": 4.36, + "learning_rate": 1.886245622509359e-05, + "loss": 0.6351, + "step": 5157, + "task_loss": 1.3631845712661743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42526522278785706, + "epoch": 4.36, + "learning_rate": 1.8856418306967756e-05, + "loss": 0.5417, + "step": 5158, + "task_loss": 0.41887953877449036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8114304542541504, + "epoch": 4.36, + "learning_rate": 1.885038038884193e-05, + "loss": 0.587, + "step": 5159, + "task_loss": 0.6104749441146851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30991843342781067, + "epoch": 4.36, + "learning_rate": 1.8844342470716097e-05, + "loss": 0.6882, + "step": 5160, + "task_loss": 0.659593403339386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.637170135974884, + "epoch": 4.36, + "learning_rate": 1.8838304552590268e-05, + "loss": 0.6361, + "step": 5161, + "task_loss": 0.5559813976287842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26045629382133484, + "epoch": 4.36, + "learning_rate": 1.8832266634464438e-05, + "loss": 0.38, + "step": 5162, + "task_loss": 0.0815131887793541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34749385714530945, + "epoch": 4.36, + "learning_rate": 1.8826228716338605e-05, + "loss": 0.539, + "step": 5163, + "task_loss": 0.39933592081069946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.727149248123169, + "epoch": 4.36, + "learning_rate": 1.882019079821278e-05, + "loss": 0.6543, + "step": 5164, + "task_loss": 1.1992418766021729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3597978949546814, + "epoch": 4.37, + "learning_rate": 1.8814152880086947e-05, + "loss": 0.5455, + "step": 5165, + "task_loss": 1.1979615688323975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6934961080551147, + "epoch": 4.37, + "learning_rate": 1.8808114961961117e-05, + "loss": 0.5409, + "step": 5166, + "task_loss": 1.4606928825378418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6971839666366577, + "epoch": 4.37, + "learning_rate": 1.8802077043835288e-05, + "loss": 0.5805, + "step": 5167, + "task_loss": 1.1718096733093262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3281139135360718, + "epoch": 4.37, + "learning_rate": 1.8796039125709455e-05, + "loss": 0.3893, + "step": 5168, + "task_loss": 0.43466925621032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.589095950126648, + "epoch": 4.37, + "learning_rate": 1.8790001207583626e-05, + "loss": 0.4319, + "step": 5169, + "task_loss": 0.7735646367073059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5194735527038574, + "epoch": 4.37, + "learning_rate": 1.8783963289457796e-05, + "loss": 0.4677, + "step": 5170, + "task_loss": 0.622076153755188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6105556488037109, + "epoch": 4.37, + "learning_rate": 1.8777925371331967e-05, + "loss": 0.6896, + "step": 5171, + "task_loss": 1.3013780117034912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.576444685459137, + "epoch": 4.37, + "learning_rate": 1.8771887453206137e-05, + "loss": 0.3812, + "step": 5172, + "task_loss": 0.7041521668434143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6930994391441345, + "epoch": 4.37, + "learning_rate": 1.8765849535080304e-05, + "loss": 0.5149, + "step": 5173, + "task_loss": 0.5136037468910217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32187363505363464, + "epoch": 4.37, + "learning_rate": 1.8759811616954475e-05, + "loss": 0.5509, + "step": 5174, + "task_loss": 0.19670556485652924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.63991379737854, + "epoch": 4.37, + "learning_rate": 1.8753773698828646e-05, + "loss": 0.4788, + "step": 5175, + "task_loss": 1.519219160079956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4820733070373535, + "epoch": 4.38, + "learning_rate": 1.8747735780702816e-05, + "loss": 0.4932, + "step": 5176, + "task_loss": 1.5110890865325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6874973177909851, + "epoch": 4.38, + "learning_rate": 1.8741697862576983e-05, + "loss": 0.5824, + "step": 5177, + "task_loss": 0.21568679809570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6455018520355225, + "epoch": 4.38, + "learning_rate": 1.8735659944451154e-05, + "loss": 0.5614, + "step": 5178, + "task_loss": 0.8208565711975098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37172046303749084, + "epoch": 4.38, + "learning_rate": 1.8729622026325324e-05, + "loss": 0.4298, + "step": 5179, + "task_loss": 0.4251770079135895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35570043325424194, + "epoch": 4.38, + "learning_rate": 1.8723584108199495e-05, + "loss": 0.4928, + "step": 5180, + "task_loss": 0.6891517639160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4183785319328308, + "epoch": 4.38, + "learning_rate": 1.8717546190073666e-05, + "loss": 0.502, + "step": 5181, + "task_loss": 0.35888147354125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44515299797058105, + "epoch": 4.38, + "learning_rate": 1.8711508271947833e-05, + "loss": 0.5921, + "step": 5182, + "task_loss": 1.1373136043548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5336998701095581, + "epoch": 4.38, + "learning_rate": 1.8705470353822003e-05, + "loss": 0.6191, + "step": 5183, + "task_loss": 0.805208146572113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7903681397438049, + "epoch": 4.38, + "learning_rate": 1.8699432435696174e-05, + "loss": 0.7983, + "step": 5184, + "task_loss": 0.9237998723983765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3902924358844757, + "epoch": 4.38, + "learning_rate": 1.869339451757034e-05, + "loss": 0.4596, + "step": 5185, + "task_loss": 0.2815991938114166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6060531139373779, + "epoch": 4.38, + "learning_rate": 1.8687356599444515e-05, + "loss": 0.4206, + "step": 5186, + "task_loss": 0.7269940972328186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3886060118675232, + "epoch": 4.38, + "learning_rate": 1.8681318681318682e-05, + "loss": 0.4631, + "step": 5187, + "task_loss": 1.064745306968689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.741595983505249, + "epoch": 4.39, + "learning_rate": 1.8675280763192853e-05, + "loss": 0.5722, + "step": 5188, + "task_loss": 0.7561153173446655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5882607698440552, + "epoch": 4.39, + "learning_rate": 1.8669242845067023e-05, + "loss": 0.5243, + "step": 5189, + "task_loss": 0.12406136095523834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7448962926864624, + "epoch": 4.39, + "learning_rate": 1.866320492694119e-05, + "loss": 0.6412, + "step": 5190, + "task_loss": 0.9993990063667297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25048762559890747, + "epoch": 4.39, + "learning_rate": 1.865716700881536e-05, + "loss": 0.3871, + "step": 5191, + "task_loss": 0.2131929248571396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46440568566322327, + "epoch": 4.39, + "learning_rate": 1.8651129090689532e-05, + "loss": 0.4771, + "step": 5192, + "task_loss": 1.067097544670105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4386768937110901, + "epoch": 4.39, + "learning_rate": 1.86450911725637e-05, + "loss": 0.4248, + "step": 5193, + "task_loss": 0.8405083417892456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40268099308013916, + "epoch": 4.39, + "learning_rate": 1.8639053254437873e-05, + "loss": 0.4916, + "step": 5194, + "task_loss": 0.8314642906188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30733591318130493, + "epoch": 4.39, + "learning_rate": 1.863301533631204e-05, + "loss": 0.3983, + "step": 5195, + "task_loss": 0.04206152260303497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5220547914505005, + "epoch": 4.39, + "learning_rate": 1.862697741818621e-05, + "loss": 0.718, + "step": 5196, + "task_loss": 0.27213263511657715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.522191047668457, + "epoch": 4.39, + "learning_rate": 1.862093950006038e-05, + "loss": 0.3941, + "step": 5197, + "task_loss": 0.5888574719429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5882545113563538, + "epoch": 4.39, + "learning_rate": 1.861490158193455e-05, + "loss": 0.5459, + "step": 5198, + "task_loss": 0.6874248385429382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7507486343383789, + "epoch": 4.39, + "learning_rate": 1.860886366380872e-05, + "loss": 0.6069, + "step": 5199, + "task_loss": 1.7811335325241089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6033719182014465, + "epoch": 4.4, + "learning_rate": 1.860282574568289e-05, + "loss": 0.5418, + "step": 5200, + "task_loss": 0.39259931445121765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35461854934692383, + "epoch": 4.4, + "learning_rate": 1.8596787827557057e-05, + "loss": 0.4779, + "step": 5201, + "task_loss": 1.1613352298736572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5100465416908264, + "epoch": 4.4, + "learning_rate": 1.859074990943123e-05, + "loss": 0.4774, + "step": 5202, + "task_loss": 0.6522606611251831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3180009424686432, + "epoch": 4.4, + "learning_rate": 1.8584711991305398e-05, + "loss": 0.3993, + "step": 5203, + "task_loss": 0.7879483699798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7164030075073242, + "epoch": 4.4, + "learning_rate": 1.857867407317957e-05, + "loss": 0.6032, + "step": 5204, + "task_loss": 1.008669376373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6012774705886841, + "epoch": 4.4, + "learning_rate": 1.857263615505374e-05, + "loss": 0.6541, + "step": 5205, + "task_loss": 0.9772782325744629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35195499658584595, + "epoch": 4.4, + "learning_rate": 1.8566598236927906e-05, + "loss": 0.4864, + "step": 5206, + "task_loss": 0.39875492453575134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47787153720855713, + "epoch": 4.4, + "learning_rate": 1.8560560318802077e-05, + "loss": 0.4436, + "step": 5207, + "task_loss": 0.7053685784339905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6070903539657593, + "epoch": 4.4, + "learning_rate": 1.8554522400676247e-05, + "loss": 0.47, + "step": 5208, + "task_loss": 0.9445880651473999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7124163508415222, + "epoch": 4.4, + "learning_rate": 1.8548484482550418e-05, + "loss": 0.4772, + "step": 5209, + "task_loss": 0.5998092293739319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5284239649772644, + "epoch": 4.4, + "learning_rate": 1.854244656442459e-05, + "loss": 0.6413, + "step": 5210, + "task_loss": 0.7293675541877747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4566672742366791, + "epoch": 4.4, + "learning_rate": 1.8536408646298756e-05, + "loss": 0.4767, + "step": 5211, + "task_loss": 0.40886664390563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4627583622932434, + "epoch": 4.41, + "learning_rate": 1.8530370728172926e-05, + "loss": 0.4265, + "step": 5212, + "task_loss": 1.441691517829895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3747980296611786, + "epoch": 4.41, + "learning_rate": 1.8524332810047097e-05, + "loss": 0.4517, + "step": 5213, + "task_loss": 0.6199540495872498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24329079687595367, + "epoch": 4.41, + "learning_rate": 1.8518294891921267e-05, + "loss": 0.3645, + "step": 5214, + "task_loss": 0.30233871936798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4385206997394562, + "epoch": 4.41, + "learning_rate": 1.8512256973795435e-05, + "loss": 0.4031, + "step": 5215, + "task_loss": 0.8836256265640259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30957403779029846, + "epoch": 4.41, + "learning_rate": 1.8506219055669605e-05, + "loss": 0.5083, + "step": 5216, + "task_loss": 1.136552333831787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5118087530136108, + "epoch": 4.41, + "learning_rate": 1.8500181137543776e-05, + "loss": 0.5646, + "step": 5217, + "task_loss": 2.026247501373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4487159252166748, + "epoch": 4.41, + "learning_rate": 1.8494143219417946e-05, + "loss": 0.6313, + "step": 5218, + "task_loss": 0.32908979058265686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.837873101234436, + "epoch": 4.41, + "learning_rate": 1.8488105301292117e-05, + "loss": 0.6047, + "step": 5219, + "task_loss": 0.8457399606704712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5687423348426819, + "epoch": 4.41, + "learning_rate": 1.8482067383166284e-05, + "loss": 0.5461, + "step": 5220, + "task_loss": 0.6929583549499512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3987710475921631, + "epoch": 4.41, + "learning_rate": 1.8476029465040455e-05, + "loss": 0.4148, + "step": 5221, + "task_loss": 0.9801090955734253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6041147708892822, + "epoch": 4.41, + "learning_rate": 1.8469991546914625e-05, + "loss": 0.4634, + "step": 5222, + "task_loss": 0.6882253289222717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29924583435058594, + "epoch": 4.41, + "learning_rate": 1.8463953628788792e-05, + "loss": 0.6147, + "step": 5223, + "task_loss": 1.0725750923156738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4224993586540222, + "epoch": 4.42, + "learning_rate": 1.8457915710662966e-05, + "loss": 0.5872, + "step": 5224, + "task_loss": 0.6673390865325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6655415892601013, + "epoch": 4.42, + "learning_rate": 1.8451877792537133e-05, + "loss": 0.505, + "step": 5225, + "task_loss": 0.2814217805862427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7435063719749451, + "epoch": 4.42, + "learning_rate": 1.8445839874411304e-05, + "loss": 0.5457, + "step": 5226, + "task_loss": 0.8360223174095154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43006569147109985, + "epoch": 4.42, + "learning_rate": 1.8439801956285475e-05, + "loss": 0.4652, + "step": 5227, + "task_loss": 0.8208799958229065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5115324854850769, + "epoch": 4.42, + "learning_rate": 1.8433764038159642e-05, + "loss": 0.5135, + "step": 5228, + "task_loss": 0.6593109965324402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27685004472732544, + "epoch": 4.42, + "learning_rate": 1.8427726120033816e-05, + "loss": 0.395, + "step": 5229, + "task_loss": 0.2624698877334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.571931004524231, + "epoch": 4.42, + "learning_rate": 1.8421688201907983e-05, + "loss": 0.5069, + "step": 5230, + "task_loss": 0.3697534203529358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5739418864250183, + "epoch": 4.42, + "learning_rate": 1.841565028378215e-05, + "loss": 0.722, + "step": 5231, + "task_loss": 1.02635657787323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2799665331840515, + "epoch": 4.42, + "learning_rate": 1.8409612365656324e-05, + "loss": 0.4128, + "step": 5232, + "task_loss": 0.3750864565372467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5008310079574585, + "epoch": 4.42, + "learning_rate": 1.840357444753049e-05, + "loss": 0.6347, + "step": 5233, + "task_loss": 1.2946327924728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4655146598815918, + "epoch": 4.42, + "learning_rate": 1.8397536529404662e-05, + "loss": 0.4888, + "step": 5234, + "task_loss": 0.3668901026248932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2846829295158386, + "epoch": 4.42, + "learning_rate": 1.8391498611278832e-05, + "loss": 0.5003, + "step": 5235, + "task_loss": 1.02667236328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3310955762863159, + "epoch": 4.43, + "learning_rate": 1.8385460693153e-05, + "loss": 0.5809, + "step": 5236, + "task_loss": 1.1437448263168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39515721797943115, + "epoch": 4.43, + "learning_rate": 1.8379422775027174e-05, + "loss": 0.4494, + "step": 5237, + "task_loss": 0.24364520609378815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46504783630371094, + "epoch": 4.43, + "learning_rate": 1.837338485690134e-05, + "loss": 0.6001, + "step": 5238, + "task_loss": 0.8616440892219543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.764136552810669, + "epoch": 4.43, + "learning_rate": 1.836734693877551e-05, + "loss": 0.5972, + "step": 5239, + "task_loss": 0.9303487539291382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.253934383392334, + "epoch": 4.43, + "learning_rate": 1.8361309020649682e-05, + "loss": 0.4531, + "step": 5240, + "task_loss": 0.698129415512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5719914436340332, + "epoch": 4.43, + "learning_rate": 1.835527110252385e-05, + "loss": 0.5762, + "step": 5241, + "task_loss": 0.8353196978569031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7070803642272949, + "epoch": 4.43, + "learning_rate": 1.834923318439802e-05, + "loss": 0.5063, + "step": 5242, + "task_loss": 0.6174041628837585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5050186514854431, + "epoch": 4.43, + "learning_rate": 1.834319526627219e-05, + "loss": 0.6314, + "step": 5243, + "task_loss": 0.16646148264408112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5740396976470947, + "epoch": 4.43, + "learning_rate": 1.833715734814636e-05, + "loss": 0.4435, + "step": 5244, + "task_loss": 1.227225422859192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3799850046634674, + "epoch": 4.43, + "learning_rate": 1.833111943002053e-05, + "loss": 0.5202, + "step": 5245, + "task_loss": 0.3683261275291443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3593989908695221, + "epoch": 4.43, + "learning_rate": 1.83250815118947e-05, + "loss": 0.4351, + "step": 5246, + "task_loss": 0.8734204769134521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46893110871315, + "epoch": 4.44, + "learning_rate": 1.831904359376887e-05, + "loss": 0.5102, + "step": 5247, + "task_loss": 0.46257284283638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48942768573760986, + "epoch": 4.44, + "learning_rate": 1.831300567564304e-05, + "loss": 0.5186, + "step": 5248, + "task_loss": 0.3115019202232361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5077903270721436, + "epoch": 4.44, + "learning_rate": 1.830696775751721e-05, + "loss": 0.5817, + "step": 5249, + "task_loss": 0.8587189316749573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7223268747329712, + "epoch": 4.44, + "learning_rate": 1.8300929839391377e-05, + "loss": 0.6004, + "step": 5250, + "task_loss": 0.47751355171203613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5748900771141052, + "epoch": 4.44, + "learning_rate": 1.8294891921265548e-05, + "loss": 0.4794, + "step": 5251, + "task_loss": 0.40447402000427246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3141838312149048, + "epoch": 4.44, + "learning_rate": 1.828885400313972e-05, + "loss": 0.4846, + "step": 5252, + "task_loss": 0.9764127731323242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48671069741249084, + "epoch": 4.44, + "learning_rate": 1.828281608501389e-05, + "loss": 0.4711, + "step": 5253, + "task_loss": 0.8839144110679626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5850273370742798, + "epoch": 4.44, + "learning_rate": 1.827677816688806e-05, + "loss": 0.6087, + "step": 5254, + "task_loss": 0.8768379092216492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6182278394699097, + "epoch": 4.44, + "learning_rate": 1.8270740248762227e-05, + "loss": 0.5383, + "step": 5255, + "task_loss": 1.4835466146469116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22329722344875336, + "epoch": 4.44, + "learning_rate": 1.8264702330636397e-05, + "loss": 0.4866, + "step": 5256, + "task_loss": 0.32439878582954407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5534935593605042, + "epoch": 4.44, + "learning_rate": 1.8258664412510568e-05, + "loss": 0.5823, + "step": 5257, + "task_loss": 0.43820783495903015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4369826316833496, + "epoch": 4.44, + "learning_rate": 1.8252626494384735e-05, + "loss": 0.4399, + "step": 5258, + "task_loss": 0.5779245495796204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5637534260749817, + "epoch": 4.45, + "learning_rate": 1.824658857625891e-05, + "loss": 0.5354, + "step": 5259, + "task_loss": 0.7149650454521179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.845821738243103, + "epoch": 4.45, + "learning_rate": 1.8240550658133076e-05, + "loss": 0.5278, + "step": 5260, + "task_loss": 0.46729978919029236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7475347518920898, + "epoch": 4.45, + "learning_rate": 1.8234512740007244e-05, + "loss": 0.5444, + "step": 5261, + "task_loss": 0.76250821352005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6794567108154297, + "epoch": 4.45, + "learning_rate": 1.8228474821881417e-05, + "loss": 0.6284, + "step": 5262, + "task_loss": 0.4331459105014801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3166576623916626, + "epoch": 4.45, + "learning_rate": 1.8222436903755585e-05, + "loss": 0.4491, + "step": 5263, + "task_loss": 0.4386400282382965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30151674151420593, + "epoch": 4.45, + "learning_rate": 1.8216398985629755e-05, + "loss": 0.3812, + "step": 5264, + "task_loss": 0.2955605685710907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.853123664855957, + "epoch": 4.45, + "learning_rate": 1.8210361067503926e-05, + "loss": 0.5466, + "step": 5265, + "task_loss": 0.5371663570404053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5875374674797058, + "epoch": 4.45, + "learning_rate": 1.8204323149378093e-05, + "loss": 0.6483, + "step": 5266, + "task_loss": 0.8730911016464233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4760594964027405, + "epoch": 4.45, + "learning_rate": 1.8198285231252267e-05, + "loss": 0.4716, + "step": 5267, + "task_loss": 1.7761266231536865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.618675947189331, + "epoch": 4.45, + "learning_rate": 1.8192247313126434e-05, + "loss": 0.4724, + "step": 5268, + "task_loss": 0.45525598526000977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34543949365615845, + "epoch": 4.45, + "learning_rate": 1.8186209395000605e-05, + "loss": 0.446, + "step": 5269, + "task_loss": 0.09220592677593231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.598749041557312, + "epoch": 4.45, + "learning_rate": 1.8180171476874775e-05, + "loss": 0.5558, + "step": 5270, + "task_loss": 1.2085145711898804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3262456953525543, + "epoch": 4.46, + "learning_rate": 1.8174133558748942e-05, + "loss": 0.4087, + "step": 5271, + "task_loss": 0.5155856013298035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.363564133644104, + "epoch": 4.46, + "learning_rate": 1.8168095640623113e-05, + "loss": 0.4454, + "step": 5272, + "task_loss": 0.49557217955589294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8112093210220337, + "epoch": 4.46, + "learning_rate": 1.8162057722497284e-05, + "loss": 0.576, + "step": 5273, + "task_loss": 0.7739707231521606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29326656460762024, + "epoch": 4.46, + "learning_rate": 1.8156019804371454e-05, + "loss": 0.484, + "step": 5274, + "task_loss": 0.36506983637809753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4936417043209076, + "epoch": 4.46, + "learning_rate": 1.8149981886245625e-05, + "loss": 0.3591, + "step": 5275, + "task_loss": 0.28767508268356323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6677588820457458, + "epoch": 4.46, + "learning_rate": 1.8143943968119792e-05, + "loss": 0.8006, + "step": 5276, + "task_loss": 0.9888085126876831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4255192279815674, + "epoch": 4.46, + "learning_rate": 1.8137906049993963e-05, + "loss": 0.4322, + "step": 5277, + "task_loss": 0.8949341773986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3338794410228729, + "epoch": 4.46, + "learning_rate": 1.8131868131868133e-05, + "loss": 0.6109, + "step": 5278, + "task_loss": 0.2139960527420044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4101320803165436, + "epoch": 4.46, + "learning_rate": 1.8125830213742304e-05, + "loss": 0.4643, + "step": 5279, + "task_loss": 0.5544992089271545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5465366244316101, + "epoch": 4.46, + "learning_rate": 1.811979229561647e-05, + "loss": 0.527, + "step": 5280, + "task_loss": 1.4394034147262573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.249984011054039, + "epoch": 4.46, + "learning_rate": 1.811375437749064e-05, + "loss": 0.4388, + "step": 5281, + "task_loss": 0.3796558380126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6977836489677429, + "epoch": 4.46, + "learning_rate": 1.8107716459364812e-05, + "loss": 0.4727, + "step": 5282, + "task_loss": 1.1670849323272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3595432639122009, + "epoch": 4.47, + "learning_rate": 1.8101678541238983e-05, + "loss": 0.3912, + "step": 5283, + "task_loss": 0.8764082193374634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43445903062820435, + "epoch": 4.47, + "learning_rate": 1.8095640623113153e-05, + "loss": 0.5729, + "step": 5284, + "task_loss": 0.6454800367355347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3363853096961975, + "epoch": 4.47, + "learning_rate": 1.808960270498732e-05, + "loss": 0.4989, + "step": 5285, + "task_loss": 0.09658601135015488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7544049024581909, + "epoch": 4.47, + "learning_rate": 1.808356478686149e-05, + "loss": 0.6114, + "step": 5286, + "task_loss": 0.6382666826248169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31915730237960815, + "epoch": 4.47, + "learning_rate": 1.807752686873566e-05, + "loss": 0.5325, + "step": 5287, + "task_loss": 0.3222561478614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39873069524765015, + "epoch": 4.47, + "learning_rate": 1.807148895060983e-05, + "loss": 0.4507, + "step": 5288, + "task_loss": 0.6492705941200256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3667658567428589, + "epoch": 4.47, + "learning_rate": 1.8065451032484003e-05, + "loss": 0.4031, + "step": 5289, + "task_loss": 0.4181379973888397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44841498136520386, + "epoch": 4.47, + "learning_rate": 1.805941311435817e-05, + "loss": 0.5069, + "step": 5290, + "task_loss": 0.25346216559410095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5366259813308716, + "epoch": 4.47, + "learning_rate": 1.805337519623234e-05, + "loss": 0.5799, + "step": 5291, + "task_loss": 0.6457687020301819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4641053080558777, + "epoch": 4.47, + "learning_rate": 1.804733727810651e-05, + "loss": 0.4812, + "step": 5292, + "task_loss": 0.3414473831653595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2725278437137604, + "epoch": 4.47, + "learning_rate": 1.8041299359980678e-05, + "loss": 0.5232, + "step": 5293, + "task_loss": 0.17817308008670807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2974191904067993, + "epoch": 4.47, + "learning_rate": 1.8035261441854852e-05, + "loss": 0.4596, + "step": 5294, + "task_loss": 0.5277233719825745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7782220840454102, + "epoch": 4.48, + "learning_rate": 1.802922352372902e-05, + "loss": 0.4658, + "step": 5295, + "task_loss": 0.9573503732681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3421165347099304, + "epoch": 4.48, + "learning_rate": 1.8023185605603186e-05, + "loss": 0.4512, + "step": 5296, + "task_loss": 1.1748689413070679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4987561106681824, + "epoch": 4.48, + "learning_rate": 1.801714768747736e-05, + "loss": 0.4432, + "step": 5297, + "task_loss": 0.5137851238250732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.585494875907898, + "epoch": 4.48, + "learning_rate": 1.8011109769351528e-05, + "loss": 0.6453, + "step": 5298, + "task_loss": 1.053551435470581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9408829212188721, + "epoch": 4.48, + "learning_rate": 1.8005071851225698e-05, + "loss": 0.6186, + "step": 5299, + "task_loss": 0.8208414316177368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.268271803855896, + "epoch": 4.48, + "learning_rate": 1.799903393309987e-05, + "loss": 0.404, + "step": 5300, + "task_loss": 0.4793378710746765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4603778123855591, + "epoch": 4.48, + "learning_rate": 1.7992996014974036e-05, + "loss": 0.5401, + "step": 5301, + "task_loss": 1.6211289167404175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5672343969345093, + "epoch": 4.48, + "learning_rate": 1.798695809684821e-05, + "loss": 0.5565, + "step": 5302, + "task_loss": 0.3223145306110382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5029144883155823, + "epoch": 4.48, + "learning_rate": 1.7980920178722377e-05, + "loss": 0.4643, + "step": 5303, + "task_loss": 1.3947439193725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8028075098991394, + "epoch": 4.48, + "learning_rate": 1.7974882260596548e-05, + "loss": 0.8083, + "step": 5304, + "task_loss": 1.7739585638046265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6616266369819641, + "epoch": 4.48, + "learning_rate": 1.7968844342470718e-05, + "loss": 0.6828, + "step": 5305, + "task_loss": 0.996625542640686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3628544211387634, + "epoch": 4.48, + "learning_rate": 1.7962806424344885e-05, + "loss": 0.4394, + "step": 5306, + "task_loss": 0.2698367238044739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6246098279953003, + "epoch": 4.49, + "learning_rate": 1.7956768506219056e-05, + "loss": 0.5343, + "step": 5307, + "task_loss": 0.4993918240070343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6738573312759399, + "epoch": 4.49, + "learning_rate": 1.7950730588093226e-05, + "loss": 0.7414, + "step": 5308, + "task_loss": 0.8142508268356323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6976404190063477, + "epoch": 4.49, + "learning_rate": 1.7944692669967397e-05, + "loss": 0.4056, + "step": 5309, + "task_loss": 1.5880228281021118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3389049768447876, + "epoch": 4.49, + "learning_rate": 1.7938654751841568e-05, + "loss": 0.4895, + "step": 5310, + "task_loss": 1.3826721906661987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3764387369155884, + "epoch": 4.49, + "learning_rate": 1.7932616833715735e-05, + "loss": 0.4961, + "step": 5311, + "task_loss": 0.6057107448577881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27936801314353943, + "epoch": 4.49, + "learning_rate": 1.7926578915589905e-05, + "loss": 0.4838, + "step": 5312, + "task_loss": 0.41532421112060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27112525701522827, + "epoch": 4.49, + "learning_rate": 1.7920540997464076e-05, + "loss": 0.4363, + "step": 5313, + "task_loss": 0.4502791166305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6065534353256226, + "epoch": 4.49, + "learning_rate": 1.7914503079338247e-05, + "loss": 0.5053, + "step": 5314, + "task_loss": 0.835750937461853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4081474542617798, + "epoch": 4.49, + "learning_rate": 1.7908465161212414e-05, + "loss": 0.5001, + "step": 5315, + "task_loss": 0.1353079080581665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3058404326438904, + "epoch": 4.49, + "learning_rate": 1.7902427243086584e-05, + "loss": 0.4807, + "step": 5316, + "task_loss": 0.4571177363395691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.383163720369339, + "epoch": 4.49, + "learning_rate": 1.7896389324960755e-05, + "loss": 0.4695, + "step": 5317, + "task_loss": 0.12312687188386917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7906149625778198, + "epoch": 4.5, + "learning_rate": 1.7890351406834922e-05, + "loss": 0.5679, + "step": 5318, + "task_loss": 0.3419434130191803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5423780083656311, + "epoch": 4.5, + "learning_rate": 1.7884313488709096e-05, + "loss": 0.572, + "step": 5319, + "task_loss": 0.8189250826835632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6535266637802124, + "epoch": 4.5, + "learning_rate": 1.7878275570583263e-05, + "loss": 0.5148, + "step": 5320, + "task_loss": 1.0353374481201172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41892099380493164, + "epoch": 4.5, + "learning_rate": 1.7872237652457434e-05, + "loss": 0.4203, + "step": 5321, + "task_loss": 0.48897427320480347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3676598370075226, + "epoch": 4.5, + "learning_rate": 1.7866199734331604e-05, + "loss": 0.386, + "step": 5322, + "task_loss": 0.23943567276000977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5012972354888916, + "epoch": 4.5, + "learning_rate": 1.786016181620577e-05, + "loss": 0.4702, + "step": 5323, + "task_loss": 0.13851799070835114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39605340361595154, + "epoch": 4.5, + "learning_rate": 1.7854123898079945e-05, + "loss": 0.4018, + "step": 5324, + "task_loss": 0.16603189706802368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4488663673400879, + "epoch": 4.5, + "learning_rate": 1.7848085979954113e-05, + "loss": 0.5076, + "step": 5325, + "task_loss": 0.4107595682144165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.250594824552536, + "epoch": 4.5, + "learning_rate": 1.784204806182828e-05, + "loss": 0.6356, + "step": 5326, + "task_loss": 0.49086838960647583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7477624416351318, + "epoch": 4.5, + "learning_rate": 1.7836010143702454e-05, + "loss": 0.4538, + "step": 5327, + "task_loss": 0.5824536085128784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5346421003341675, + "epoch": 4.5, + "learning_rate": 1.782997222557662e-05, + "loss": 0.383, + "step": 5328, + "task_loss": 0.5682054758071899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5592981576919556, + "epoch": 4.5, + "learning_rate": 1.782393430745079e-05, + "loss": 0.4773, + "step": 5329, + "task_loss": 0.8664562702178955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42156779766082764, + "epoch": 4.51, + "learning_rate": 1.7817896389324962e-05, + "loss": 0.4397, + "step": 5330, + "task_loss": 0.08832123875617981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5465600490570068, + "epoch": 4.51, + "learning_rate": 1.781185847119913e-05, + "loss": 0.4458, + "step": 5331, + "task_loss": 1.0452375411987305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5026817917823792, + "epoch": 4.51, + "learning_rate": 1.7805820553073303e-05, + "loss": 0.5313, + "step": 5332, + "task_loss": 1.053684949874878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4082244038581848, + "epoch": 4.51, + "learning_rate": 1.779978263494747e-05, + "loss": 0.5175, + "step": 5333, + "task_loss": 0.27448105812072754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8339166045188904, + "epoch": 4.51, + "learning_rate": 1.779374471682164e-05, + "loss": 0.9317, + "step": 5334, + "task_loss": 1.4815112352371216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.528512716293335, + "epoch": 4.51, + "learning_rate": 1.778770679869581e-05, + "loss": 0.5277, + "step": 5335, + "task_loss": 1.1600215435028076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5920917987823486, + "epoch": 4.51, + "learning_rate": 1.778166888056998e-05, + "loss": 0.4269, + "step": 5336, + "task_loss": 0.8102577924728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49797794222831726, + "epoch": 4.51, + "learning_rate": 1.777563096244415e-05, + "loss": 0.4414, + "step": 5337, + "task_loss": 0.34728923439979553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6472315192222595, + "epoch": 4.51, + "learning_rate": 1.776959304431832e-05, + "loss": 0.6574, + "step": 5338, + "task_loss": 1.7491613626480103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5813652873039246, + "epoch": 4.51, + "learning_rate": 1.776355512619249e-05, + "loss": 0.4419, + "step": 5339, + "task_loss": 0.5841543674468994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3097229599952698, + "epoch": 4.51, + "learning_rate": 1.775751720806666e-05, + "loss": 0.5605, + "step": 5340, + "task_loss": 0.9844646453857422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4437071979045868, + "epoch": 4.51, + "learning_rate": 1.7751479289940828e-05, + "loss": 0.5001, + "step": 5341, + "task_loss": 0.5132348537445068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3323069214820862, + "epoch": 4.52, + "learning_rate": 1.7745441371815e-05, + "loss": 0.4748, + "step": 5342, + "task_loss": 0.05384235829114914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3070586025714874, + "epoch": 4.52, + "learning_rate": 1.773940345368917e-05, + "loss": 0.5171, + "step": 5343, + "task_loss": 0.13457843661308289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5785703659057617, + "epoch": 4.52, + "learning_rate": 1.773336553556334e-05, + "loss": 0.6213, + "step": 5344, + "task_loss": 0.82038813829422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34522831439971924, + "epoch": 4.52, + "learning_rate": 1.7727327617437507e-05, + "loss": 0.4157, + "step": 5345, + "task_loss": 0.3677099049091339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26567530632019043, + "epoch": 4.52, + "learning_rate": 1.7721289699311678e-05, + "loss": 0.5837, + "step": 5346, + "task_loss": 0.19009901583194733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5682413578033447, + "epoch": 4.52, + "learning_rate": 1.7715251781185848e-05, + "loss": 0.467, + "step": 5347, + "task_loss": 1.4593312740325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38993000984191895, + "epoch": 4.52, + "learning_rate": 1.770921386306002e-05, + "loss": 0.5105, + "step": 5348, + "task_loss": 0.4969681203365326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7158200740814209, + "epoch": 4.52, + "learning_rate": 1.770317594493419e-05, + "loss": 0.6092, + "step": 5349, + "task_loss": 0.9733288288116455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5068963170051575, + "epoch": 4.52, + "learning_rate": 1.7697138026808357e-05, + "loss": 0.5364, + "step": 5350, + "task_loss": 0.8530793190002441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39091604948043823, + "epoch": 4.52, + "learning_rate": 1.7691100108682527e-05, + "loss": 0.5089, + "step": 5351, + "task_loss": 0.4238569438457489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22887153923511505, + "epoch": 4.52, + "learning_rate": 1.7685062190556698e-05, + "loss": 0.4738, + "step": 5352, + "task_loss": 0.1170913502573967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7772027254104614, + "epoch": 4.52, + "learning_rate": 1.7679024272430865e-05, + "loss": 0.5625, + "step": 5353, + "task_loss": 1.0978670120239258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.448789119720459, + "epoch": 4.53, + "learning_rate": 1.767298635430504e-05, + "loss": 0.479, + "step": 5354, + "task_loss": 0.906532347202301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3463450074195862, + "epoch": 4.53, + "learning_rate": 1.7666948436179206e-05, + "loss": 0.4775, + "step": 5355, + "task_loss": 0.5426546335220337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6102800369262695, + "epoch": 4.53, + "learning_rate": 1.7660910518053377e-05, + "loss": 0.5927, + "step": 5356, + "task_loss": 1.043286919593811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4140098989009857, + "epoch": 4.53, + "learning_rate": 1.7654872599927547e-05, + "loss": 0.6372, + "step": 5357, + "task_loss": 0.318030446767807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0827990770339966, + "epoch": 4.53, + "learning_rate": 1.7648834681801714e-05, + "loss": 0.6889, + "step": 5358, + "task_loss": 1.0528075695037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0321016311645508, + "epoch": 4.53, + "learning_rate": 1.764279676367589e-05, + "loss": 0.5987, + "step": 5359, + "task_loss": 1.218461513519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7496699094772339, + "epoch": 4.53, + "learning_rate": 1.7636758845550056e-05, + "loss": 0.5928, + "step": 5360, + "task_loss": 1.0113623142242432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44815367460250854, + "epoch": 4.53, + "learning_rate": 1.7630720927424223e-05, + "loss": 0.5364, + "step": 5361, + "task_loss": 0.4028920829296112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6147949695587158, + "epoch": 4.53, + "learning_rate": 1.7624683009298397e-05, + "loss": 0.6673, + "step": 5362, + "task_loss": 0.5058900713920593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5667515397071838, + "epoch": 4.53, + "learning_rate": 1.7618645091172564e-05, + "loss": 0.6834, + "step": 5363, + "task_loss": 1.2718034982681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5230085253715515, + "epoch": 4.53, + "learning_rate": 1.7612607173046734e-05, + "loss": 0.5252, + "step": 5364, + "task_loss": 0.8540233969688416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5938000679016113, + "epoch": 4.53, + "learning_rate": 1.7606569254920905e-05, + "loss": 0.5357, + "step": 5365, + "task_loss": 0.6390989422798157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7102471590042114, + "epoch": 4.54, + "learning_rate": 1.7600531336795072e-05, + "loss": 0.5168, + "step": 5366, + "task_loss": 0.9288778901100159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44815492630004883, + "epoch": 4.54, + "learning_rate": 1.7594493418669246e-05, + "loss": 0.4686, + "step": 5367, + "task_loss": 0.4388757050037384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35524606704711914, + "epoch": 4.54, + "learning_rate": 1.7588455500543413e-05, + "loss": 0.3171, + "step": 5368, + "task_loss": 0.3818599283695221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3984129726886749, + "epoch": 4.54, + "learning_rate": 1.7582417582417584e-05, + "loss": 0.5666, + "step": 5369, + "task_loss": 0.576820433139801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39762088656425476, + "epoch": 4.54, + "learning_rate": 1.7576379664291754e-05, + "loss": 0.4554, + "step": 5370, + "task_loss": 0.6187169551849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5552012920379639, + "epoch": 4.54, + "learning_rate": 1.757034174616592e-05, + "loss": 0.5831, + "step": 5371, + "task_loss": 0.601713240146637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35238274931907654, + "epoch": 4.54, + "learning_rate": 1.7564303828040092e-05, + "loss": 0.4057, + "step": 5372, + "task_loss": 0.27822574973106384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48213905096054077, + "epoch": 4.54, + "learning_rate": 1.7558265909914263e-05, + "loss": 0.436, + "step": 5373, + "task_loss": 1.175932765007019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2747301459312439, + "epoch": 4.54, + "learning_rate": 1.755222799178843e-05, + "loss": 0.4423, + "step": 5374, + "task_loss": 0.44009676575660706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6568558216094971, + "epoch": 4.54, + "learning_rate": 1.7546190073662604e-05, + "loss": 0.7501, + "step": 5375, + "task_loss": 0.5169509053230286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3097965121269226, + "epoch": 4.54, + "learning_rate": 1.754015215553677e-05, + "loss": 0.503, + "step": 5376, + "task_loss": 0.14502836763858795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5641940832138062, + "epoch": 4.54, + "learning_rate": 1.753411423741094e-05, + "loss": 0.445, + "step": 5377, + "task_loss": 0.3894634246826172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4124983251094818, + "epoch": 4.55, + "learning_rate": 1.7528076319285112e-05, + "loss": 0.3643, + "step": 5378, + "task_loss": 0.4353872537612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46088892221450806, + "epoch": 4.55, + "learning_rate": 1.752203840115928e-05, + "loss": 0.6306, + "step": 5379, + "task_loss": 0.4199133515357971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6478526592254639, + "epoch": 4.55, + "learning_rate": 1.751600048303345e-05, + "loss": 0.538, + "step": 5380, + "task_loss": 0.6538507342338562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5436025261878967, + "epoch": 4.55, + "learning_rate": 1.750996256490762e-05, + "loss": 0.5435, + "step": 5381, + "task_loss": 0.6517281532287598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6368356943130493, + "epoch": 4.55, + "learning_rate": 1.750392464678179e-05, + "loss": 0.5425, + "step": 5382, + "task_loss": 1.0692280530929565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6070291996002197, + "epoch": 4.55, + "learning_rate": 1.749788672865596e-05, + "loss": 0.5608, + "step": 5383, + "task_loss": 1.1291602849960327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6542998552322388, + "epoch": 4.55, + "learning_rate": 1.749184881053013e-05, + "loss": 0.6098, + "step": 5384, + "task_loss": 0.4997357428073883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3936842679977417, + "epoch": 4.55, + "learning_rate": 1.74858108924043e-05, + "loss": 0.4516, + "step": 5385, + "task_loss": 0.8660577535629272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3022080659866333, + "epoch": 4.55, + "learning_rate": 1.747977297427847e-05, + "loss": 0.6486, + "step": 5386, + "task_loss": 1.1156678199768066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23548036813735962, + "epoch": 4.55, + "learning_rate": 1.747373505615264e-05, + "loss": 0.6573, + "step": 5387, + "task_loss": 1.430626392364502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4717443883419037, + "epoch": 4.55, + "learning_rate": 1.7467697138026808e-05, + "loss": 0.6065, + "step": 5388, + "task_loss": 0.7611106038093567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7046002149581909, + "epoch": 4.56, + "learning_rate": 1.746165921990098e-05, + "loss": 0.5374, + "step": 5389, + "task_loss": 1.1918392181396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0158601999282837, + "epoch": 4.56, + "learning_rate": 1.745562130177515e-05, + "loss": 0.6643, + "step": 5390, + "task_loss": 2.2650279998779297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4283023476600647, + "epoch": 4.56, + "learning_rate": 1.7449583383649316e-05, + "loss": 0.4511, + "step": 5391, + "task_loss": 0.7552892565727234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32387951016426086, + "epoch": 4.56, + "learning_rate": 1.744354546552349e-05, + "loss": 0.441, + "step": 5392, + "task_loss": 0.5742499232292175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26102110743522644, + "epoch": 4.56, + "learning_rate": 1.7437507547397657e-05, + "loss": 0.5776, + "step": 5393, + "task_loss": 0.9389200210571289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47874292731285095, + "epoch": 4.56, + "learning_rate": 1.7431469629271828e-05, + "loss": 0.5667, + "step": 5394, + "task_loss": 0.6723522543907166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5342756509780884, + "epoch": 4.56, + "learning_rate": 1.7425431711146e-05, + "loss": 0.4889, + "step": 5395, + "task_loss": 0.4787804186344147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2845149338245392, + "epoch": 4.56, + "learning_rate": 1.7419393793020166e-05, + "loss": 0.5459, + "step": 5396, + "task_loss": 0.39839816093444824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5762616395950317, + "epoch": 4.56, + "learning_rate": 1.741335587489434e-05, + "loss": 0.6429, + "step": 5397, + "task_loss": 1.0555853843688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3270818591117859, + "epoch": 4.56, + "learning_rate": 1.7407317956768507e-05, + "loss": 0.4264, + "step": 5398, + "task_loss": 1.3446632623672485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3082667291164398, + "epoch": 4.56, + "learning_rate": 1.7401280038642674e-05, + "loss": 0.4011, + "step": 5399, + "task_loss": 0.3053587079048157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17635950446128845, + "epoch": 4.56, + "learning_rate": 1.7395242120516848e-05, + "loss": 0.5058, + "step": 5400, + "task_loss": 0.28611624240875244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.567571759223938, + "epoch": 4.57, + "learning_rate": 1.7389204202391015e-05, + "loss": 0.4869, + "step": 5401, + "task_loss": 0.4277133047580719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6084592938423157, + "epoch": 4.57, + "learning_rate": 1.7383166284265186e-05, + "loss": 0.5504, + "step": 5402, + "task_loss": 0.44851553440093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0547802448272705, + "epoch": 4.57, + "learning_rate": 1.7377128366139356e-05, + "loss": 0.6098, + "step": 5403, + "task_loss": 0.41734224557876587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5102459788322449, + "epoch": 4.57, + "learning_rate": 1.7371090448013523e-05, + "loss": 0.571, + "step": 5404, + "task_loss": 0.8372790813446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4160216450691223, + "epoch": 4.57, + "learning_rate": 1.7365052529887697e-05, + "loss": 0.4595, + "step": 5405, + "task_loss": 0.07762360572814941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3123959004878998, + "epoch": 4.57, + "learning_rate": 1.7359014611761865e-05, + "loss": 0.4323, + "step": 5406, + "task_loss": 1.3092138767242432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39229950308799744, + "epoch": 4.57, + "learning_rate": 1.7352976693636035e-05, + "loss": 0.4534, + "step": 5407, + "task_loss": 1.0657851696014404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8867568969726562, + "epoch": 4.57, + "learning_rate": 1.7346938775510206e-05, + "loss": 0.6639, + "step": 5408, + "task_loss": 1.0215117931365967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49989867210388184, + "epoch": 4.57, + "learning_rate": 1.7340900857384373e-05, + "loss": 0.6464, + "step": 5409, + "task_loss": 0.24181507527828217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15571282804012299, + "epoch": 4.57, + "learning_rate": 1.7334862939258543e-05, + "loss": 0.3863, + "step": 5410, + "task_loss": 0.4521203339099884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2486138790845871, + "epoch": 4.57, + "learning_rate": 1.7328825021132714e-05, + "loss": 0.497, + "step": 5411, + "task_loss": 0.44419437646865845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48378828167915344, + "epoch": 4.57, + "learning_rate": 1.7322787103006885e-05, + "loss": 0.502, + "step": 5412, + "task_loss": 1.1538996696472168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.272225946187973, + "epoch": 4.58, + "learning_rate": 1.7316749184881055e-05, + "loss": 0.4748, + "step": 5413, + "task_loss": 0.34107521176338196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5830373764038086, + "epoch": 4.58, + "learning_rate": 1.7310711266755222e-05, + "loss": 0.509, + "step": 5414, + "task_loss": 0.4812740385532379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4598248600959778, + "epoch": 4.58, + "learning_rate": 1.7304673348629393e-05, + "loss": 0.3967, + "step": 5415, + "task_loss": 0.45853331685066223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21499529480934143, + "epoch": 4.58, + "learning_rate": 1.7298635430503563e-05, + "loss": 0.3623, + "step": 5416, + "task_loss": 0.28228867053985596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6637423634529114, + "epoch": 4.58, + "learning_rate": 1.7292597512377734e-05, + "loss": 0.6109, + "step": 5417, + "task_loss": 1.0034449100494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35611775517463684, + "epoch": 4.58, + "learning_rate": 1.72865595942519e-05, + "loss": 0.5057, + "step": 5418, + "task_loss": 0.7666125297546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4196309447288513, + "epoch": 4.58, + "learning_rate": 1.7280521676126072e-05, + "loss": 0.431, + "step": 5419, + "task_loss": 0.6954057812690735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5230875015258789, + "epoch": 4.58, + "learning_rate": 1.7274483758000242e-05, + "loss": 0.452, + "step": 5420, + "task_loss": 0.4380897879600525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7252964973449707, + "epoch": 4.58, + "learning_rate": 1.7268445839874413e-05, + "loss": 0.7035, + "step": 5421, + "task_loss": 1.138519525527954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4954253137111664, + "epoch": 4.58, + "learning_rate": 1.7262407921748584e-05, + "loss": 0.5144, + "step": 5422, + "task_loss": 0.27527084946632385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1861920803785324, + "epoch": 4.58, + "learning_rate": 1.725637000362275e-05, + "loss": 0.3423, + "step": 5423, + "task_loss": 0.5326087474822998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4058866798877716, + "epoch": 4.58, + "learning_rate": 1.725033208549692e-05, + "loss": 0.5362, + "step": 5424, + "task_loss": 0.2310435026884079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33664992451667786, + "epoch": 4.59, + "learning_rate": 1.7244294167371092e-05, + "loss": 0.489, + "step": 5425, + "task_loss": 0.37981417775154114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3225569725036621, + "epoch": 4.59, + "learning_rate": 1.723825624924526e-05, + "loss": 0.4404, + "step": 5426, + "task_loss": 0.11073686182498932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5739659070968628, + "epoch": 4.59, + "learning_rate": 1.7232218331119433e-05, + "loss": 0.484, + "step": 5427, + "task_loss": 1.5879143476486206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7419968247413635, + "epoch": 4.59, + "learning_rate": 1.72261804129936e-05, + "loss": 0.4549, + "step": 5428, + "task_loss": 0.17074747383594513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5476052761077881, + "epoch": 4.59, + "learning_rate": 1.722014249486777e-05, + "loss": 0.5176, + "step": 5429, + "task_loss": 0.5691224336624146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7450308203697205, + "epoch": 4.59, + "learning_rate": 1.721410457674194e-05, + "loss": 0.4774, + "step": 5430, + "task_loss": 0.827069103717804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40565258264541626, + "epoch": 4.59, + "learning_rate": 1.720806665861611e-05, + "loss": 0.3981, + "step": 5431, + "task_loss": 0.5419377088546753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5382448434829712, + "epoch": 4.59, + "learning_rate": 1.7202028740490282e-05, + "loss": 0.4304, + "step": 5432, + "task_loss": 0.5731014609336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2793627679347992, + "epoch": 4.59, + "learning_rate": 1.719599082236445e-05, + "loss": 0.5521, + "step": 5433, + "task_loss": 0.42362678050994873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4054490327835083, + "epoch": 4.59, + "learning_rate": 1.7189952904238617e-05, + "loss": 0.508, + "step": 5434, + "task_loss": 0.6656392812728882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5713086128234863, + "epoch": 4.59, + "learning_rate": 1.718391498611279e-05, + "loss": 0.5252, + "step": 5435, + "task_loss": 0.8863192200660706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5195351839065552, + "epoch": 4.59, + "learning_rate": 1.7177877067986958e-05, + "loss": 0.4355, + "step": 5436, + "task_loss": 1.3757165670394897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5164188146591187, + "epoch": 4.6, + "learning_rate": 1.717183914986113e-05, + "loss": 0.4118, + "step": 5437, + "task_loss": 1.4070080518722534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7834854125976562, + "epoch": 4.6, + "learning_rate": 1.71658012317353e-05, + "loss": 0.4677, + "step": 5438, + "task_loss": 0.6979140043258667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21009454131126404, + "epoch": 4.6, + "learning_rate": 1.7159763313609466e-05, + "loss": 0.3682, + "step": 5439, + "task_loss": 0.581680953502655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7120217680931091, + "epoch": 4.6, + "learning_rate": 1.715372539548364e-05, + "loss": 0.7316, + "step": 5440, + "task_loss": 0.6432896256446838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7193764448165894, + "epoch": 4.6, + "learning_rate": 1.7147687477357807e-05, + "loss": 0.5476, + "step": 5441, + "task_loss": 1.3694767951965332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38746583461761475, + "epoch": 4.6, + "learning_rate": 1.7141649559231978e-05, + "loss": 0.4159, + "step": 5442, + "task_loss": 0.6383300423622131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20137155055999756, + "epoch": 4.6, + "learning_rate": 1.713561164110615e-05, + "loss": 0.3626, + "step": 5443, + "task_loss": 0.11660199612379074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7759224772453308, + "epoch": 4.6, + "learning_rate": 1.7129573722980316e-05, + "loss": 0.6706, + "step": 5444, + "task_loss": 0.7136790156364441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3924662172794342, + "epoch": 4.6, + "learning_rate": 1.7123535804854486e-05, + "loss": 0.5078, + "step": 5445, + "task_loss": 0.24098271131515503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.872663676738739, + "epoch": 4.6, + "learning_rate": 1.7117497886728657e-05, + "loss": 0.6756, + "step": 5446, + "task_loss": 0.8768436312675476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5718363523483276, + "epoch": 4.6, + "learning_rate": 1.7111459968602827e-05, + "loss": 0.5372, + "step": 5447, + "task_loss": 0.46772676706314087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3965407907962799, + "epoch": 4.6, + "learning_rate": 1.7105422050476995e-05, + "loss": 0.402, + "step": 5448, + "task_loss": 0.4877094626426697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5396556258201599, + "epoch": 4.61, + "learning_rate": 1.7099384132351165e-05, + "loss": 0.4931, + "step": 5449, + "task_loss": 0.8469643592834473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38385307788848877, + "epoch": 4.61, + "learning_rate": 1.7093346214225336e-05, + "loss": 0.4464, + "step": 5450, + "task_loss": 0.3472599387168884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4008443355560303, + "epoch": 4.61, + "learning_rate": 1.7087308296099506e-05, + "loss": 0.4277, + "step": 5451, + "task_loss": 0.6118302345275879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37542954087257385, + "epoch": 4.61, + "learning_rate": 1.7081270377973677e-05, + "loss": 0.4545, + "step": 5452, + "task_loss": 0.6742677688598633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8103146553039551, + "epoch": 4.61, + "learning_rate": 1.7075232459847844e-05, + "loss": 0.7134, + "step": 5453, + "task_loss": 0.8821404576301575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5318930149078369, + "epoch": 4.61, + "learning_rate": 1.7069194541722015e-05, + "loss": 0.5496, + "step": 5454, + "task_loss": 1.2930275201797485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4825977683067322, + "epoch": 4.61, + "learning_rate": 1.7063156623596185e-05, + "loss": 0.6599, + "step": 5455, + "task_loss": 0.7124821543693542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19768565893173218, + "epoch": 4.61, + "learning_rate": 1.7057118705470352e-05, + "loss": 0.5225, + "step": 5456, + "task_loss": 0.0757354348897934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5554444193840027, + "epoch": 4.61, + "learning_rate": 1.7051080787344526e-05, + "loss": 0.5496, + "step": 5457, + "task_loss": 1.1591628789901733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2524852156639099, + "epoch": 4.61, + "learning_rate": 1.7045042869218694e-05, + "loss": 0.4676, + "step": 5458, + "task_loss": 0.6055819988250732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5855680108070374, + "epoch": 4.61, + "learning_rate": 1.7039004951092864e-05, + "loss": 0.6348, + "step": 5459, + "task_loss": 1.4980745315551758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2372620552778244, + "epoch": 4.61, + "learning_rate": 1.7032967032967035e-05, + "loss": 0.4694, + "step": 5460, + "task_loss": 0.34745949506759644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2282392829656601, + "epoch": 4.62, + "learning_rate": 1.7026929114841202e-05, + "loss": 0.4466, + "step": 5461, + "task_loss": 0.17458760738372803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36137285828590393, + "epoch": 4.62, + "learning_rate": 1.7020891196715376e-05, + "loss": 0.4898, + "step": 5462, + "task_loss": 1.096798300743103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8885866403579712, + "epoch": 4.62, + "learning_rate": 1.7014853278589543e-05, + "loss": 0.535, + "step": 5463, + "task_loss": 1.8363418579101562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4085941016674042, + "epoch": 4.62, + "learning_rate": 1.700881536046371e-05, + "loss": 0.4095, + "step": 5464, + "task_loss": 0.6862176060676575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2316611111164093, + "epoch": 4.62, + "learning_rate": 1.7002777442337884e-05, + "loss": 0.4696, + "step": 5465, + "task_loss": 0.4384218454360962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22684870660305023, + "epoch": 4.62, + "learning_rate": 1.699673952421205e-05, + "loss": 0.4614, + "step": 5466, + "task_loss": 0.6826119422912598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48724451661109924, + "epoch": 4.62, + "learning_rate": 1.6990701606086222e-05, + "loss": 0.3695, + "step": 5467, + "task_loss": 0.6927094459533691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6074528694152832, + "epoch": 4.62, + "learning_rate": 1.6984663687960393e-05, + "loss": 0.5865, + "step": 5468, + "task_loss": 0.263081431388855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31559231877326965, + "epoch": 4.62, + "learning_rate": 1.697862576983456e-05, + "loss": 0.4768, + "step": 5469, + "task_loss": 0.1779908388853073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.541858434677124, + "epoch": 4.62, + "learning_rate": 1.6972587851708734e-05, + "loss": 0.4705, + "step": 5470, + "task_loss": 0.8442181348800659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4459349811077118, + "epoch": 4.62, + "learning_rate": 1.69665499335829e-05, + "loss": 0.4104, + "step": 5471, + "task_loss": 0.16329926252365112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45675379037857056, + "epoch": 4.63, + "learning_rate": 1.696051201545707e-05, + "loss": 0.4997, + "step": 5472, + "task_loss": 0.768002450466156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7953561544418335, + "epoch": 4.63, + "learning_rate": 1.6954474097331242e-05, + "loss": 0.564, + "step": 5473, + "task_loss": 0.651598334312439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5013059973716736, + "epoch": 4.63, + "learning_rate": 1.694843617920541e-05, + "loss": 0.515, + "step": 5474, + "task_loss": 0.2683398127555847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41923826932907104, + "epoch": 4.63, + "learning_rate": 1.694239826107958e-05, + "loss": 0.4483, + "step": 5475, + "task_loss": 0.33310467004776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3472307324409485, + "epoch": 4.63, + "learning_rate": 1.693636034295375e-05, + "loss": 0.6053, + "step": 5476, + "task_loss": 1.075961709022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40587639808654785, + "epoch": 4.63, + "learning_rate": 1.693032242482792e-05, + "loss": 0.5657, + "step": 5477, + "task_loss": 1.0487520694732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4962114095687866, + "epoch": 4.63, + "learning_rate": 1.692428450670209e-05, + "loss": 0.5145, + "step": 5478, + "task_loss": 0.8609305620193481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2868644893169403, + "epoch": 4.63, + "learning_rate": 1.691824658857626e-05, + "loss": 0.3927, + "step": 5479, + "task_loss": 0.16665948927402496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8361634612083435, + "epoch": 4.63, + "learning_rate": 1.691220867045043e-05, + "loss": 0.6755, + "step": 5480, + "task_loss": 0.9524366855621338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4137452244758606, + "epoch": 4.63, + "learning_rate": 1.69061707523246e-05, + "loss": 0.5854, + "step": 5481, + "task_loss": 0.4507189393043518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5147236585617065, + "epoch": 4.63, + "learning_rate": 1.690013283419877e-05, + "loss": 0.4341, + "step": 5482, + "task_loss": 0.4382483661174774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4952126145362854, + "epoch": 4.63, + "learning_rate": 1.6894094916072938e-05, + "loss": 0.3705, + "step": 5483, + "task_loss": 1.4983782768249512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24983911216259003, + "epoch": 4.64, + "learning_rate": 1.6888056997947108e-05, + "loss": 0.3442, + "step": 5484, + "task_loss": 0.6257839798927307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6691804528236389, + "epoch": 4.64, + "learning_rate": 1.688201907982128e-05, + "loss": 0.6041, + "step": 5485, + "task_loss": 0.7354838252067566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46053797006607056, + "epoch": 4.64, + "learning_rate": 1.687598116169545e-05, + "loss": 0.4503, + "step": 5486, + "task_loss": 0.4965379238128662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6130886077880859, + "epoch": 4.64, + "learning_rate": 1.686994324356962e-05, + "loss": 0.5423, + "step": 5487, + "task_loss": 0.485504150390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4289071261882782, + "epoch": 4.64, + "learning_rate": 1.6863905325443787e-05, + "loss": 0.5484, + "step": 5488, + "task_loss": 0.6775035262107849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4593380093574524, + "epoch": 4.64, + "learning_rate": 1.6857867407317958e-05, + "loss": 0.4377, + "step": 5489, + "task_loss": 0.4332638084888458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8059954643249512, + "epoch": 4.64, + "learning_rate": 1.6851829489192128e-05, + "loss": 0.4635, + "step": 5490, + "task_loss": 1.5076130628585815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4752492308616638, + "epoch": 4.64, + "learning_rate": 1.6845791571066295e-05, + "loss": 0.5055, + "step": 5491, + "task_loss": 0.7368783354759216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4119701385498047, + "epoch": 4.64, + "learning_rate": 1.683975365294047e-05, + "loss": 0.5459, + "step": 5492, + "task_loss": 0.3374151289463043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6387439370155334, + "epoch": 4.64, + "learning_rate": 1.6833715734814636e-05, + "loss": 0.4198, + "step": 5493, + "task_loss": 0.5488913059234619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5181170701980591, + "epoch": 4.64, + "learning_rate": 1.6827677816688807e-05, + "loss": 0.5759, + "step": 5494, + "task_loss": 0.5951091647148132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4909436106681824, + "epoch": 4.64, + "learning_rate": 1.6821639898562978e-05, + "loss": 0.51, + "step": 5495, + "task_loss": 0.5649423599243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4099913239479065, + "epoch": 4.65, + "learning_rate": 1.6815601980437145e-05, + "loss": 0.4278, + "step": 5496, + "task_loss": 1.1192233562469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3774700164794922, + "epoch": 4.65, + "learning_rate": 1.680956406231132e-05, + "loss": 0.5683, + "step": 5497, + "task_loss": 1.0754592418670654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3625316023826599, + "epoch": 4.65, + "learning_rate": 1.6803526144185486e-05, + "loss": 0.3739, + "step": 5498, + "task_loss": 1.0995348691940308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36881405115127563, + "epoch": 4.65, + "learning_rate": 1.6797488226059653e-05, + "loss": 0.3073, + "step": 5499, + "task_loss": 0.2381659597158432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3133361041545868, + "epoch": 4.65, + "learning_rate": 1.6791450307933827e-05, + "loss": 0.444, + "step": 5500, + "task_loss": 0.5702794790267944 + }, + { + "epoch": 4.65, + "eval_accuracy": 0.9043564356435644, + "eval_loss": 0.32675039768218994, + "eval_runtime": 229.206, + "eval_samples_per_second": 110.163, + "eval_steps_per_second": 0.864, + "step": 5500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4147684574127197, + "epoch": 4.65, + "learning_rate": 1.6785412389807994e-05, + "loss": 0.4852, + "step": 5501, + "task_loss": 0.3123387098312378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41534972190856934, + "epoch": 4.65, + "learning_rate": 1.6779374471682165e-05, + "loss": 0.4718, + "step": 5502, + "task_loss": 0.49083971977233887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33565112948417664, + "epoch": 4.65, + "learning_rate": 1.6773336553556335e-05, + "loss": 0.4887, + "step": 5503, + "task_loss": 1.2359427213668823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32279524207115173, + "epoch": 4.65, + "learning_rate": 1.6767298635430503e-05, + "loss": 0.4697, + "step": 5504, + "task_loss": 0.3989332318305969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6157023310661316, + "epoch": 4.65, + "learning_rate": 1.6761260717304677e-05, + "loss": 0.6247, + "step": 5505, + "task_loss": 0.37597203254699707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5993354320526123, + "epoch": 4.65, + "learning_rate": 1.6755222799178844e-05, + "loss": 0.4321, + "step": 5506, + "task_loss": 0.4510357975959778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37996605038642883, + "epoch": 4.65, + "learning_rate": 1.6749184881053014e-05, + "loss": 0.4441, + "step": 5507, + "task_loss": 0.8775479793548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3377774655818939, + "epoch": 4.66, + "learning_rate": 1.6743146962927185e-05, + "loss": 0.4732, + "step": 5508, + "task_loss": 1.3134193420410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49325382709503174, + "epoch": 4.66, + "learning_rate": 1.6737109044801352e-05, + "loss": 0.524, + "step": 5509, + "task_loss": 0.6207937598228455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3179996609687805, + "epoch": 4.66, + "learning_rate": 1.6731071126675523e-05, + "loss": 0.4563, + "step": 5510, + "task_loss": 0.34824472665786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33780375123023987, + "epoch": 4.66, + "learning_rate": 1.6725033208549693e-05, + "loss": 0.5856, + "step": 5511, + "task_loss": 0.4170486330986023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.289669394493103, + "epoch": 4.66, + "learning_rate": 1.6718995290423864e-05, + "loss": 0.5084, + "step": 5512, + "task_loss": 0.02646147459745407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7914003729820251, + "epoch": 4.66, + "learning_rate": 1.671295737229803e-05, + "loss": 0.6016, + "step": 5513, + "task_loss": 0.6610104441642761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2872586250305176, + "epoch": 4.66, + "learning_rate": 1.67069194541722e-05, + "loss": 0.4729, + "step": 5514, + "task_loss": 0.3066456913948059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2636492848396301, + "epoch": 4.66, + "learning_rate": 1.6700881536046372e-05, + "loss": 0.5339, + "step": 5515, + "task_loss": 0.3916909694671631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23536480963230133, + "epoch": 4.66, + "learning_rate": 1.6694843617920543e-05, + "loss": 0.3685, + "step": 5516, + "task_loss": 0.55550217628479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41108274459838867, + "epoch": 4.66, + "learning_rate": 1.6688805699794713e-05, + "loss": 0.5015, + "step": 5517, + "task_loss": 0.629697322845459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3850550651550293, + "epoch": 4.66, + "learning_rate": 1.668276778166888e-05, + "loss": 0.3613, + "step": 5518, + "task_loss": 0.8391644954681396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.523095965385437, + "epoch": 4.66, + "learning_rate": 1.667672986354305e-05, + "loss": 0.6693, + "step": 5519, + "task_loss": 0.818298876285553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2860914468765259, + "epoch": 4.67, + "learning_rate": 1.667069194541722e-05, + "loss": 0.4147, + "step": 5520, + "task_loss": 0.8251075744628906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22926054894924164, + "epoch": 4.67, + "learning_rate": 1.666465402729139e-05, + "loss": 0.4231, + "step": 5521, + "task_loss": 0.0574369803071022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6622275114059448, + "epoch": 4.67, + "learning_rate": 1.6658616109165563e-05, + "loss": 0.6638, + "step": 5522, + "task_loss": 0.7030304074287415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5450395941734314, + "epoch": 4.67, + "learning_rate": 1.665257819103973e-05, + "loss": 0.5008, + "step": 5523, + "task_loss": 0.48319536447525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9016443490982056, + "epoch": 4.67, + "learning_rate": 1.66465402729139e-05, + "loss": 0.7457, + "step": 5524, + "task_loss": 0.3283531665802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35418522357940674, + "epoch": 4.67, + "learning_rate": 1.664050235478807e-05, + "loss": 0.5549, + "step": 5525, + "task_loss": 0.12774589657783508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45832329988479614, + "epoch": 4.67, + "learning_rate": 1.6634464436662238e-05, + "loss": 0.4973, + "step": 5526, + "task_loss": 0.2509722113609314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6581006050109863, + "epoch": 4.67, + "learning_rate": 1.6628426518536412e-05, + "loss": 0.4521, + "step": 5527, + "task_loss": 1.7563774585723877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36212971806526184, + "epoch": 4.67, + "learning_rate": 1.662238860041058e-05, + "loss": 0.4199, + "step": 5528, + "task_loss": 0.7862239480018616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.436812162399292, + "epoch": 4.67, + "learning_rate": 1.6616350682284747e-05, + "loss": 0.6177, + "step": 5529, + "task_loss": 0.20202237367630005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4297260046005249, + "epoch": 4.67, + "learning_rate": 1.661031276415892e-05, + "loss": 0.5178, + "step": 5530, + "task_loss": 0.6849742531776428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.719603955745697, + "epoch": 4.67, + "learning_rate": 1.6604274846033088e-05, + "loss": 0.5958, + "step": 5531, + "task_loss": 1.1233752965927124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5176264047622681, + "epoch": 4.68, + "learning_rate": 1.6598236927907258e-05, + "loss": 0.4948, + "step": 5532, + "task_loss": 0.6687624454498291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6180204153060913, + "epoch": 4.68, + "learning_rate": 1.659219900978143e-05, + "loss": 0.4425, + "step": 5533, + "task_loss": 0.12002148479223251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4559169113636017, + "epoch": 4.68, + "learning_rate": 1.6586161091655596e-05, + "loss": 0.4809, + "step": 5534, + "task_loss": 0.3982764482498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17186826467514038, + "epoch": 4.68, + "learning_rate": 1.658012317352977e-05, + "loss": 0.5009, + "step": 5535, + "task_loss": 0.5122766494750977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40602174401283264, + "epoch": 4.68, + "learning_rate": 1.6574085255403937e-05, + "loss": 0.5229, + "step": 5536, + "task_loss": 1.0928808450698853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7302366495132446, + "epoch": 4.68, + "learning_rate": 1.6568047337278108e-05, + "loss": 0.4838, + "step": 5537, + "task_loss": 1.037413239479065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3035411238670349, + "epoch": 4.68, + "learning_rate": 1.6562009419152278e-05, + "loss": 0.4952, + "step": 5538, + "task_loss": 0.5348610877990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4587266445159912, + "epoch": 4.68, + "learning_rate": 1.6555971501026445e-05, + "loss": 0.4972, + "step": 5539, + "task_loss": 1.2847027778625488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4258194863796234, + "epoch": 4.68, + "learning_rate": 1.6549933582900616e-05, + "loss": 0.623, + "step": 5540, + "task_loss": 0.5269262194633484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4110688865184784, + "epoch": 4.68, + "learning_rate": 1.6543895664774787e-05, + "loss": 0.3561, + "step": 5541, + "task_loss": 0.4787455201148987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23311346769332886, + "epoch": 4.68, + "learning_rate": 1.6537857746648957e-05, + "loss": 0.4193, + "step": 5542, + "task_loss": 1.0165972709655762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2708989381790161, + "epoch": 4.69, + "learning_rate": 1.6531819828523128e-05, + "loss": 0.4763, + "step": 5543, + "task_loss": 0.7854048013687134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3676983118057251, + "epoch": 4.69, + "learning_rate": 1.6525781910397295e-05, + "loss": 0.4102, + "step": 5544, + "task_loss": 0.4296169877052307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3464120924472809, + "epoch": 4.69, + "learning_rate": 1.6519743992271465e-05, + "loss": 0.4025, + "step": 5545, + "task_loss": 0.6425986886024475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5495314598083496, + "epoch": 4.69, + "learning_rate": 1.6513706074145636e-05, + "loss": 0.4297, + "step": 5546, + "task_loss": 0.7150076031684875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36066633462905884, + "epoch": 4.69, + "learning_rate": 1.6507668156019807e-05, + "loss": 0.3932, + "step": 5547, + "task_loss": 0.5827043652534485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3435899615287781, + "epoch": 4.69, + "learning_rate": 1.6501630237893974e-05, + "loss": 0.5748, + "step": 5548, + "task_loss": 0.7936814427375793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.317018061876297, + "epoch": 4.69, + "learning_rate": 1.6495592319768144e-05, + "loss": 0.3345, + "step": 5549, + "task_loss": 0.7015313506126404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8925701379776001, + "epoch": 4.69, + "learning_rate": 1.6489554401642315e-05, + "loss": 0.5696, + "step": 5550, + "task_loss": 0.6457761526107788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34813034534454346, + "epoch": 4.69, + "learning_rate": 1.6483516483516486e-05, + "loss": 0.4001, + "step": 5551, + "task_loss": 0.7111232280731201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47973620891571045, + "epoch": 4.69, + "learning_rate": 1.6477478565390653e-05, + "loss": 0.5676, + "step": 5552, + "task_loss": 0.7966150641441345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5473794937133789, + "epoch": 4.69, + "learning_rate": 1.6471440647264823e-05, + "loss": 0.5306, + "step": 5553, + "task_loss": 0.954845130443573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39358165860176086, + "epoch": 4.69, + "learning_rate": 1.6465402729138994e-05, + "loss": 0.4211, + "step": 5554, + "task_loss": 0.22330710291862488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37325403094291687, + "epoch": 4.7, + "learning_rate": 1.6459364811013164e-05, + "loss": 0.4607, + "step": 5555, + "task_loss": 0.5965156555175781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5200329422950745, + "epoch": 4.7, + "learning_rate": 1.645332689288733e-05, + "loss": 0.5767, + "step": 5556, + "task_loss": 0.55033940076828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5287368297576904, + "epoch": 4.7, + "learning_rate": 1.6447288974761502e-05, + "loss": 0.5875, + "step": 5557, + "task_loss": 0.3366183638572693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5582667589187622, + "epoch": 4.7, + "learning_rate": 1.6441251056635673e-05, + "loss": 0.5322, + "step": 5558, + "task_loss": 0.8144546747207642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5114145278930664, + "epoch": 4.7, + "learning_rate": 1.6435213138509843e-05, + "loss": 0.4241, + "step": 5559, + "task_loss": 0.6799103617668152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48670458793640137, + "epoch": 4.7, + "learning_rate": 1.6429175220384014e-05, + "loss": 0.4541, + "step": 5560, + "task_loss": 0.7209588885307312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3843807876110077, + "epoch": 4.7, + "learning_rate": 1.642313730225818e-05, + "loss": 0.4801, + "step": 5561, + "task_loss": 0.5228511095046997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7493880987167358, + "epoch": 4.7, + "learning_rate": 1.641709938413235e-05, + "loss": 0.6059, + "step": 5562, + "task_loss": 0.8271488547325134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43549206852912903, + "epoch": 4.7, + "learning_rate": 1.6411061466006522e-05, + "loss": 0.4738, + "step": 5563, + "task_loss": 0.49297448992729187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5315837264060974, + "epoch": 4.7, + "learning_rate": 1.640502354788069e-05, + "loss": 0.635, + "step": 5564, + "task_loss": 0.13145673274993896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32037752866744995, + "epoch": 4.7, + "learning_rate": 1.6398985629754863e-05, + "loss": 0.5194, + "step": 5565, + "task_loss": 0.5963384509086609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39566391706466675, + "epoch": 4.7, + "learning_rate": 1.639294771162903e-05, + "loss": 0.5598, + "step": 5566, + "task_loss": 0.6603891253471375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46277955174446106, + "epoch": 4.71, + "learning_rate": 1.63869097935032e-05, + "loss": 0.4827, + "step": 5567, + "task_loss": 0.6127215027809143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6226282119750977, + "epoch": 4.71, + "learning_rate": 1.638087187537737e-05, + "loss": 0.6866, + "step": 5568, + "task_loss": 0.1706911027431488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4275314509868622, + "epoch": 4.71, + "learning_rate": 1.637483395725154e-05, + "loss": 0.5025, + "step": 5569, + "task_loss": 0.3638128638267517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38007891178131104, + "epoch": 4.71, + "learning_rate": 1.636879603912571e-05, + "loss": 0.3946, + "step": 5570, + "task_loss": 0.5417359471321106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8141539096832275, + "epoch": 4.71, + "learning_rate": 1.636275812099988e-05, + "loss": 0.6436, + "step": 5571, + "task_loss": 0.6862582564353943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1129305362701416, + "epoch": 4.71, + "learning_rate": 1.6356720202874047e-05, + "loss": 0.6323, + "step": 5572, + "task_loss": 0.851077139377594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5597401261329651, + "epoch": 4.71, + "learning_rate": 1.635068228474822e-05, + "loss": 0.4722, + "step": 5573, + "task_loss": 0.8088442087173462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4983350336551666, + "epoch": 4.71, + "learning_rate": 1.634464436662239e-05, + "loss": 0.4478, + "step": 5574, + "task_loss": 0.627778947353363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40715292096138, + "epoch": 4.71, + "learning_rate": 1.633860644849656e-05, + "loss": 0.5729, + "step": 5575, + "task_loss": 0.929154634475708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5212172865867615, + "epoch": 4.71, + "learning_rate": 1.633256853037073e-05, + "loss": 0.4585, + "step": 5576, + "task_loss": 0.14736983180046082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3323495388031006, + "epoch": 4.71, + "learning_rate": 1.6326530612244897e-05, + "loss": 0.4718, + "step": 5577, + "task_loss": 0.6321436166763306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5940913558006287, + "epoch": 4.71, + "learning_rate": 1.6320492694119067e-05, + "loss": 0.4797, + "step": 5578, + "task_loss": 0.5767788887023926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.536165177822113, + "epoch": 4.72, + "learning_rate": 1.6314454775993238e-05, + "loss": 0.564, + "step": 5579, + "task_loss": 1.1479891538619995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5259643197059631, + "epoch": 4.72, + "learning_rate": 1.630841685786741e-05, + "loss": 0.5189, + "step": 5580, + "task_loss": 0.16334594786167145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31547582149505615, + "epoch": 4.72, + "learning_rate": 1.630237893974158e-05, + "loss": 0.5224, + "step": 5581, + "task_loss": 0.10409452766180038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32807981967926025, + "epoch": 4.72, + "learning_rate": 1.6296341021615746e-05, + "loss": 0.4363, + "step": 5582, + "task_loss": 0.48191022872924805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6458097696304321, + "epoch": 4.72, + "learning_rate": 1.6290303103489917e-05, + "loss": 0.715, + "step": 5583, + "task_loss": 0.6684753894805908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5509357452392578, + "epoch": 4.72, + "learning_rate": 1.6284265185364087e-05, + "loss": 0.5419, + "step": 5584, + "task_loss": 1.3463982343673706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6108837127685547, + "epoch": 4.72, + "learning_rate": 1.6278227267238258e-05, + "loss": 0.4388, + "step": 5585, + "task_loss": 1.5121443271636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42857879400253296, + "epoch": 4.72, + "learning_rate": 1.6272189349112425e-05, + "loss": 0.4971, + "step": 5586, + "task_loss": 0.5821583867073059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19345209002494812, + "epoch": 4.72, + "learning_rate": 1.6266151430986596e-05, + "loss": 0.4361, + "step": 5587, + "task_loss": 0.31440532207489014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43258535861968994, + "epoch": 4.72, + "learning_rate": 1.6260113512860766e-05, + "loss": 0.4752, + "step": 5588, + "task_loss": 1.4167523384094238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5530897378921509, + "epoch": 4.72, + "learning_rate": 1.6254075594734937e-05, + "loss": 0.4815, + "step": 5589, + "task_loss": 0.5023097395896912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4738820791244507, + "epoch": 4.72, + "learning_rate": 1.6248037676609107e-05, + "loss": 0.4031, + "step": 5590, + "task_loss": 0.3816971182823181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39053091406822205, + "epoch": 4.73, + "learning_rate": 1.6241999758483274e-05, + "loss": 0.5554, + "step": 5591, + "task_loss": 1.148266077041626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4704574942588806, + "epoch": 4.73, + "learning_rate": 1.6235961840357445e-05, + "loss": 0.465, + "step": 5592, + "task_loss": 0.7403675317764282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5168968439102173, + "epoch": 4.73, + "learning_rate": 1.6229923922231616e-05, + "loss": 0.4938, + "step": 5593, + "task_loss": 0.9816514849662781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5262459516525269, + "epoch": 4.73, + "learning_rate": 1.6223886004105783e-05, + "loss": 0.6371, + "step": 5594, + "task_loss": 0.9309276342391968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5352354049682617, + "epoch": 4.73, + "learning_rate": 1.6217848085979957e-05, + "loss": 0.6131, + "step": 5595, + "task_loss": 0.4466126263141632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.284623384475708, + "epoch": 4.73, + "learning_rate": 1.6211810167854124e-05, + "loss": 0.4867, + "step": 5596, + "task_loss": 0.44486290216445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4374670386314392, + "epoch": 4.73, + "learning_rate": 1.6205772249728295e-05, + "loss": 0.4941, + "step": 5597, + "task_loss": 1.2740380764007568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46359074115753174, + "epoch": 4.73, + "learning_rate": 1.6199734331602465e-05, + "loss": 0.4302, + "step": 5598, + "task_loss": 0.6965603828430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4394988417625427, + "epoch": 4.73, + "learning_rate": 1.6193696413476632e-05, + "loss": 0.429, + "step": 5599, + "task_loss": 0.44126826524734497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6105198264122009, + "epoch": 4.73, + "learning_rate": 1.6187658495350806e-05, + "loss": 0.5792, + "step": 5600, + "task_loss": 0.6733864545822144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5707597136497498, + "epoch": 4.73, + "learning_rate": 1.6181620577224973e-05, + "loss": 0.4944, + "step": 5601, + "task_loss": 0.34913721680641174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4054502546787262, + "epoch": 4.73, + "learning_rate": 1.617558265909914e-05, + "loss": 0.5655, + "step": 5602, + "task_loss": 1.294619083404541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5315719246864319, + "epoch": 4.74, + "learning_rate": 1.6169544740973315e-05, + "loss": 0.437, + "step": 5603, + "task_loss": 0.06435100734233856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3744083642959595, + "epoch": 4.74, + "learning_rate": 1.6163506822847482e-05, + "loss": 0.4109, + "step": 5604, + "task_loss": 0.4203505516052246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3453739881515503, + "epoch": 4.74, + "learning_rate": 1.6157468904721652e-05, + "loss": 0.5286, + "step": 5605, + "task_loss": 1.4459477663040161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3642445504665375, + "epoch": 4.74, + "learning_rate": 1.6151430986595823e-05, + "loss": 0.5754, + "step": 5606, + "task_loss": 0.5929403305053711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7598825693130493, + "epoch": 4.74, + "learning_rate": 1.614539306846999e-05, + "loss": 0.5458, + "step": 5607, + "task_loss": 0.3928650915622711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7772847414016724, + "epoch": 4.74, + "learning_rate": 1.6139355150344164e-05, + "loss": 0.5578, + "step": 5608, + "task_loss": 1.1732648611068726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5160415768623352, + "epoch": 4.74, + "learning_rate": 1.613331723221833e-05, + "loss": 0.5886, + "step": 5609, + "task_loss": 0.4166787266731262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7778906226158142, + "epoch": 4.74, + "learning_rate": 1.6127279314092502e-05, + "loss": 0.6079, + "step": 5610, + "task_loss": 1.2076853513717651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.592951774597168, + "epoch": 4.74, + "learning_rate": 1.6121241395966672e-05, + "loss": 0.5109, + "step": 5611, + "task_loss": 0.8812345862388611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5346471071243286, + "epoch": 4.74, + "learning_rate": 1.611520347784084e-05, + "loss": 0.5163, + "step": 5612, + "task_loss": 1.240616798400879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7768629789352417, + "epoch": 4.74, + "learning_rate": 1.610916555971501e-05, + "loss": 0.4074, + "step": 5613, + "task_loss": 0.7720212340354919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4200711250305176, + "epoch": 4.75, + "learning_rate": 1.610312764158918e-05, + "loss": 0.7311, + "step": 5614, + "task_loss": 0.4565313458442688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5321477055549622, + "epoch": 4.75, + "learning_rate": 1.609708972346335e-05, + "loss": 0.5121, + "step": 5615, + "task_loss": 1.1981844902038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5908830165863037, + "epoch": 4.75, + "learning_rate": 1.6091051805337522e-05, + "loss": 0.515, + "step": 5616, + "task_loss": 1.3312439918518066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42604202032089233, + "epoch": 4.75, + "learning_rate": 1.608501388721169e-05, + "loss": 0.4352, + "step": 5617, + "task_loss": 1.0120296478271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5041521191596985, + "epoch": 4.75, + "learning_rate": 1.607897596908586e-05, + "loss": 0.4305, + "step": 5618, + "task_loss": 0.36833247542381287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5687348246574402, + "epoch": 4.75, + "learning_rate": 1.607293805096003e-05, + "loss": 0.5372, + "step": 5619, + "task_loss": 1.5214262008666992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5037246942520142, + "epoch": 4.75, + "learning_rate": 1.60669001328342e-05, + "loss": 0.4808, + "step": 5620, + "task_loss": 0.48083245754241943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.466033011674881, + "epoch": 4.75, + "learning_rate": 1.6060862214708368e-05, + "loss": 0.5682, + "step": 5621, + "task_loss": 0.7249603271484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40441203117370605, + "epoch": 4.75, + "learning_rate": 1.605482429658254e-05, + "loss": 0.596, + "step": 5622, + "task_loss": 0.49672913551330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5674706697463989, + "epoch": 4.75, + "learning_rate": 1.604878637845671e-05, + "loss": 0.4534, + "step": 5623, + "task_loss": 0.5650836229324341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38541892170906067, + "epoch": 4.75, + "learning_rate": 1.604274846033088e-05, + "loss": 0.4972, + "step": 5624, + "task_loss": 0.569334864616394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4885154366493225, + "epoch": 4.75, + "learning_rate": 1.603671054220505e-05, + "loss": 0.6367, + "step": 5625, + "task_loss": 1.1376960277557373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5204009413719177, + "epoch": 4.76, + "learning_rate": 1.6030672624079217e-05, + "loss": 0.4242, + "step": 5626, + "task_loss": 0.4900137484073639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3744223713874817, + "epoch": 4.76, + "learning_rate": 1.6024634705953388e-05, + "loss": 0.6107, + "step": 5627, + "task_loss": 0.6716447472572327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5073534250259399, + "epoch": 4.76, + "learning_rate": 1.601859678782756e-05, + "loss": 0.3355, + "step": 5628, + "task_loss": 0.5383113622665405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5712324380874634, + "epoch": 4.76, + "learning_rate": 1.6012558869701726e-05, + "loss": 0.5572, + "step": 5629, + "task_loss": 0.4822181165218353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6921467781066895, + "epoch": 4.76, + "learning_rate": 1.60065209515759e-05, + "loss": 0.5376, + "step": 5630, + "task_loss": 0.7090402245521545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6182292103767395, + "epoch": 4.76, + "learning_rate": 1.6000483033450067e-05, + "loss": 0.6534, + "step": 5631, + "task_loss": 0.5375955104827881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5978118181228638, + "epoch": 4.76, + "learning_rate": 1.5994445115324237e-05, + "loss": 0.5421, + "step": 5632, + "task_loss": 0.7081409096717834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30184221267700195, + "epoch": 4.76, + "learning_rate": 1.5988407197198408e-05, + "loss": 0.3536, + "step": 5633, + "task_loss": 0.3410731256008148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47292661666870117, + "epoch": 4.76, + "learning_rate": 1.5982369279072575e-05, + "loss": 0.4276, + "step": 5634, + "task_loss": 1.0052160024642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3137657046318054, + "epoch": 4.76, + "learning_rate": 1.5976331360946746e-05, + "loss": 0.396, + "step": 5635, + "task_loss": 0.4215075969696045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49811023473739624, + "epoch": 4.76, + "learning_rate": 1.5970293442820916e-05, + "loss": 0.4922, + "step": 5636, + "task_loss": 0.48547983169555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7490163445472717, + "epoch": 4.76, + "learning_rate": 1.5964255524695084e-05, + "loss": 0.5161, + "step": 5637, + "task_loss": 0.8177903294563293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4051777124404907, + "epoch": 4.77, + "learning_rate": 1.5958217606569257e-05, + "loss": 0.6321, + "step": 5638, + "task_loss": 0.6814428567886353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45394420623779297, + "epoch": 4.77, + "learning_rate": 1.5952179688443425e-05, + "loss": 0.4648, + "step": 5639, + "task_loss": 0.8112760186195374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6142861843109131, + "epoch": 4.77, + "learning_rate": 1.5946141770317595e-05, + "loss": 0.5072, + "step": 5640, + "task_loss": 1.8828442096710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4361482560634613, + "epoch": 4.77, + "learning_rate": 1.5940103852191766e-05, + "loss": 0.4997, + "step": 5641, + "task_loss": 0.7529924511909485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21560516953468323, + "epoch": 4.77, + "learning_rate": 1.5934065934065933e-05, + "loss": 0.5234, + "step": 5642, + "task_loss": 0.21767646074295044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9339041113853455, + "epoch": 4.77, + "learning_rate": 1.5928028015940104e-05, + "loss": 0.7083, + "step": 5643, + "task_loss": 1.0546343326568604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27588972449302673, + "epoch": 4.77, + "learning_rate": 1.5921990097814274e-05, + "loss": 0.4961, + "step": 5644, + "task_loss": 0.8762207627296448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4512467086315155, + "epoch": 4.77, + "learning_rate": 1.5915952179688445e-05, + "loss": 0.4759, + "step": 5645, + "task_loss": 0.8578436970710754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19333422183990479, + "epoch": 4.77, + "learning_rate": 1.5909914261562615e-05, + "loss": 0.4834, + "step": 5646, + "task_loss": 0.2782150208950043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2531431317329407, + "epoch": 4.77, + "learning_rate": 1.5903876343436782e-05, + "loss": 0.6147, + "step": 5647, + "task_loss": 0.7841020226478577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6769366264343262, + "epoch": 4.77, + "learning_rate": 1.5897838425310953e-05, + "loss": 0.5867, + "step": 5648, + "task_loss": 0.6421873569488525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4271872341632843, + "epoch": 4.77, + "learning_rate": 1.5891800507185124e-05, + "loss": 0.6318, + "step": 5649, + "task_loss": 0.5544843673706055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2765531539916992, + "epoch": 4.78, + "learning_rate": 1.5885762589059294e-05, + "loss": 0.3965, + "step": 5650, + "task_loss": 0.1339876651763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24641792476177216, + "epoch": 4.78, + "learning_rate": 1.587972467093346e-05, + "loss": 0.4109, + "step": 5651, + "task_loss": 0.1329040825366974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23534739017486572, + "epoch": 4.78, + "learning_rate": 1.5873686752807632e-05, + "loss": 0.4167, + "step": 5652, + "task_loss": 1.1230051517486572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3777313828468323, + "epoch": 4.78, + "learning_rate": 1.5867648834681802e-05, + "loss": 0.4568, + "step": 5653, + "task_loss": 0.6795060038566589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3452783524990082, + "epoch": 4.78, + "learning_rate": 1.5861610916555973e-05, + "loss": 0.3956, + "step": 5654, + "task_loss": 0.22551333904266357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6292358040809631, + "epoch": 4.78, + "learning_rate": 1.5855572998430144e-05, + "loss": 0.6387, + "step": 5655, + "task_loss": 0.9434925317764282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31812524795532227, + "epoch": 4.78, + "learning_rate": 1.584953508030431e-05, + "loss": 0.5439, + "step": 5656, + "task_loss": 1.0697795152664185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5132414102554321, + "epoch": 4.78, + "learning_rate": 1.584349716217848e-05, + "loss": 0.4892, + "step": 5657, + "task_loss": 0.2622401714324951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5250824093818665, + "epoch": 4.78, + "learning_rate": 1.5837459244052652e-05, + "loss": 0.5517, + "step": 5658, + "task_loss": 1.0126469135284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.054580569267273, + "epoch": 4.78, + "learning_rate": 1.583142132592682e-05, + "loss": 0.5761, + "step": 5659, + "task_loss": 0.16966791450977325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6412950754165649, + "epoch": 4.78, + "learning_rate": 1.5825383407800993e-05, + "loss": 0.5976, + "step": 5660, + "task_loss": 0.4650421142578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20212990045547485, + "epoch": 4.78, + "learning_rate": 1.581934548967516e-05, + "loss": 0.4795, + "step": 5661, + "task_loss": 0.4432717561721802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45405086874961853, + "epoch": 4.79, + "learning_rate": 1.581330757154933e-05, + "loss": 0.4861, + "step": 5662, + "task_loss": 0.09262710809707642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.641856849193573, + "epoch": 4.79, + "learning_rate": 1.58072696534235e-05, + "loss": 0.4098, + "step": 5663, + "task_loss": 0.5284962058067322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5719764828681946, + "epoch": 4.79, + "learning_rate": 1.580123173529767e-05, + "loss": 0.4695, + "step": 5664, + "task_loss": 0.8492804169654846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25218066573143005, + "epoch": 4.79, + "learning_rate": 1.5795193817171843e-05, + "loss": 0.3622, + "step": 5665, + "task_loss": 0.7229119539260864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5369490385055542, + "epoch": 4.79, + "learning_rate": 1.578915589904601e-05, + "loss": 0.6206, + "step": 5666, + "task_loss": 0.2394445538520813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27831780910491943, + "epoch": 4.79, + "learning_rate": 1.5783117980920177e-05, + "loss": 0.4182, + "step": 5667, + "task_loss": 0.36029720306396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6309812664985657, + "epoch": 4.79, + "learning_rate": 1.577708006279435e-05, + "loss": 0.5437, + "step": 5668, + "task_loss": 0.5753858685493469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5308962464332581, + "epoch": 4.79, + "learning_rate": 1.5771042144668518e-05, + "loss": 0.4824, + "step": 5669, + "task_loss": 0.33407866954803467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3617568612098694, + "epoch": 4.79, + "learning_rate": 1.576500422654269e-05, + "loss": 0.4649, + "step": 5670, + "task_loss": 0.6683066487312317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.122196078300476, + "epoch": 4.79, + "learning_rate": 1.575896630841686e-05, + "loss": 0.8524, + "step": 5671, + "task_loss": 1.0444464683532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3187519609928131, + "epoch": 4.79, + "learning_rate": 1.5752928390291026e-05, + "loss": 0.4193, + "step": 5672, + "task_loss": 0.5669804215431213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2780523896217346, + "epoch": 4.79, + "learning_rate": 1.57468904721652e-05, + "loss": 0.4899, + "step": 5673, + "task_loss": 0.7641419172286987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26874101161956787, + "epoch": 4.8, + "learning_rate": 1.5740852554039368e-05, + "loss": 0.6131, + "step": 5674, + "task_loss": 0.6044903993606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29261094331741333, + "epoch": 4.8, + "learning_rate": 1.5734814635913538e-05, + "loss": 0.5649, + "step": 5675, + "task_loss": 0.6579650044441223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3985699415206909, + "epoch": 4.8, + "learning_rate": 1.572877671778771e-05, + "loss": 0.4611, + "step": 5676, + "task_loss": 0.3705599009990692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5954338908195496, + "epoch": 4.8, + "learning_rate": 1.5722738799661876e-05, + "loss": 0.518, + "step": 5677, + "task_loss": 0.6047748923301697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6246927976608276, + "epoch": 4.8, + "learning_rate": 1.5716700881536046e-05, + "loss": 0.582, + "step": 5678, + "task_loss": 0.9215129017829895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3362606465816498, + "epoch": 4.8, + "learning_rate": 1.5710662963410217e-05, + "loss": 0.4873, + "step": 5679, + "task_loss": 1.6423377990722656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3829401433467865, + "epoch": 4.8, + "learning_rate": 1.5704625045284388e-05, + "loss": 0.4536, + "step": 5680, + "task_loss": 1.2405848503112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37810492515563965, + "epoch": 4.8, + "learning_rate": 1.5698587127158558e-05, + "loss": 0.6177, + "step": 5681, + "task_loss": 0.7280991077423096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.719547688961029, + "epoch": 4.8, + "learning_rate": 1.5692549209032725e-05, + "loss": 0.663, + "step": 5682, + "task_loss": 0.48877912759780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2729325592517853, + "epoch": 4.8, + "learning_rate": 1.5686511290906896e-05, + "loss": 0.4549, + "step": 5683, + "task_loss": 0.061100929975509644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4527157247066498, + "epoch": 4.8, + "learning_rate": 1.5680473372781066e-05, + "loss": 0.4292, + "step": 5684, + "task_loss": 0.6397699117660522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3957953155040741, + "epoch": 4.81, + "learning_rate": 1.5674435454655237e-05, + "loss": 0.4408, + "step": 5685, + "task_loss": 0.9481605887413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3514915704727173, + "epoch": 4.81, + "learning_rate": 1.5668397536529404e-05, + "loss": 0.4833, + "step": 5686, + "task_loss": 0.7950453758239746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2876395285129547, + "epoch": 4.81, + "learning_rate": 1.5662359618403575e-05, + "loss": 0.3692, + "step": 5687, + "task_loss": 0.2596665918827057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5985586643218994, + "epoch": 4.81, + "learning_rate": 1.5656321700277745e-05, + "loss": 0.5823, + "step": 5688, + "task_loss": 0.5179651379585266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7245669960975647, + "epoch": 4.81, + "learning_rate": 1.5650283782151916e-05, + "loss": 0.471, + "step": 5689, + "task_loss": 1.018504023551941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4200316369533539, + "epoch": 4.81, + "learning_rate": 1.5644245864026087e-05, + "loss": 0.4043, + "step": 5690, + "task_loss": 0.4716778099536896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5841517448425293, + "epoch": 4.81, + "learning_rate": 1.5638207945900254e-05, + "loss": 0.5514, + "step": 5691, + "task_loss": 1.409725308418274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42265236377716064, + "epoch": 4.81, + "learning_rate": 1.5632170027774424e-05, + "loss": 0.6344, + "step": 5692, + "task_loss": 1.5041499137878418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0094242095947266, + "epoch": 4.81, + "learning_rate": 1.5626132109648595e-05, + "loss": 0.6298, + "step": 5693, + "task_loss": 1.2930750846862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6513161659240723, + "epoch": 4.81, + "learning_rate": 1.5620094191522762e-05, + "loss": 0.5954, + "step": 5694, + "task_loss": 0.5762127637863159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5854684114456177, + "epoch": 4.81, + "learning_rate": 1.5614056273396936e-05, + "loss": 0.5761, + "step": 5695, + "task_loss": 1.0394959449768066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4219217002391815, + "epoch": 4.81, + "learning_rate": 1.5608018355271103e-05, + "loss": 0.4843, + "step": 5696, + "task_loss": 0.49058088660240173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5715158581733704, + "epoch": 4.82, + "learning_rate": 1.5601980437145274e-05, + "loss": 0.5053, + "step": 5697, + "task_loss": 0.7075931429862976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2913628816604614, + "epoch": 4.82, + "learning_rate": 1.5595942519019444e-05, + "loss": 0.418, + "step": 5698, + "task_loss": 1.1778804063796997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5293245315551758, + "epoch": 4.82, + "learning_rate": 1.558990460089361e-05, + "loss": 0.6095, + "step": 5699, + "task_loss": 1.0530405044555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.467443585395813, + "epoch": 4.82, + "learning_rate": 1.5583866682767782e-05, + "loss": 0.4423, + "step": 5700, + "task_loss": 0.5259082317352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5770244002342224, + "epoch": 4.82, + "learning_rate": 1.5577828764641953e-05, + "loss": 0.4712, + "step": 5701, + "task_loss": 0.44420406222343445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3809109032154083, + "epoch": 4.82, + "learning_rate": 1.557179084651612e-05, + "loss": 0.4773, + "step": 5702, + "task_loss": 0.27997690439224243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4990959167480469, + "epoch": 4.82, + "learning_rate": 1.5565752928390294e-05, + "loss": 0.4796, + "step": 5703, + "task_loss": 0.8982170224189758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9252793788909912, + "epoch": 4.82, + "learning_rate": 1.555971501026446e-05, + "loss": 0.492, + "step": 5704, + "task_loss": 0.9382551312446594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6022292375564575, + "epoch": 4.82, + "learning_rate": 1.555367709213863e-05, + "loss": 0.4542, + "step": 5705, + "task_loss": 0.13141119480133057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5711396932601929, + "epoch": 4.82, + "learning_rate": 1.5547639174012802e-05, + "loss": 0.5575, + "step": 5706, + "task_loss": 0.6998443603515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39606767892837524, + "epoch": 4.82, + "learning_rate": 1.554160125588697e-05, + "loss": 0.6169, + "step": 5707, + "task_loss": 0.775436282157898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.716209888458252, + "epoch": 4.82, + "learning_rate": 1.553556333776114e-05, + "loss": 0.6187, + "step": 5708, + "task_loss": 0.8288116455078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6977271437644958, + "epoch": 4.83, + "learning_rate": 1.552952541963531e-05, + "loss": 0.544, + "step": 5709, + "task_loss": 0.32154110074043274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5062721967697144, + "epoch": 4.83, + "learning_rate": 1.552348750150948e-05, + "loss": 0.6182, + "step": 5710, + "task_loss": 0.8445614576339722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.415189266204834, + "epoch": 4.83, + "learning_rate": 1.551744958338365e-05, + "loss": 0.504, + "step": 5711, + "task_loss": 0.7094087600708008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.603539764881134, + "epoch": 4.83, + "learning_rate": 1.551141166525782e-05, + "loss": 0.5029, + "step": 5712, + "task_loss": 0.07858951389789581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47042083740234375, + "epoch": 4.83, + "learning_rate": 1.550537374713199e-05, + "loss": 0.6009, + "step": 5713, + "task_loss": 0.8537420630455017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.437353253364563, + "epoch": 4.83, + "learning_rate": 1.549933582900616e-05, + "loss": 0.514, + "step": 5714, + "task_loss": 0.22918996214866638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0176355838775635, + "epoch": 4.83, + "learning_rate": 1.549329791088033e-05, + "loss": 0.5532, + "step": 5715, + "task_loss": 0.4721285402774811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36173272132873535, + "epoch": 4.83, + "learning_rate": 1.5487259992754498e-05, + "loss": 0.581, + "step": 5716, + "task_loss": 0.39891916513442993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4689919948577881, + "epoch": 4.83, + "learning_rate": 1.5481222074628668e-05, + "loss": 0.7947, + "step": 5717, + "task_loss": 0.5248963236808777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29008325934410095, + "epoch": 4.83, + "learning_rate": 1.547518415650284e-05, + "loss": 0.4691, + "step": 5718, + "task_loss": 0.43997618556022644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5842678546905518, + "epoch": 4.83, + "learning_rate": 1.546914623837701e-05, + "loss": 0.554, + "step": 5719, + "task_loss": 0.33819183707237244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.609019935131073, + "epoch": 4.83, + "learning_rate": 1.546310832025118e-05, + "loss": 0.6015, + "step": 5720, + "task_loss": 0.6360526084899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5081681609153748, + "epoch": 4.84, + "learning_rate": 1.5457070402125347e-05, + "loss": 0.4845, + "step": 5721, + "task_loss": 1.3137153387069702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.287191778421402, + "epoch": 4.84, + "learning_rate": 1.5451032483999518e-05, + "loss": 0.4253, + "step": 5722, + "task_loss": 0.13402938842773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38607972860336304, + "epoch": 4.84, + "learning_rate": 1.5444994565873688e-05, + "loss": 0.4028, + "step": 5723, + "task_loss": 0.103619284927845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5350126624107361, + "epoch": 4.84, + "learning_rate": 1.5438956647747855e-05, + "loss": 0.5525, + "step": 5724, + "task_loss": 1.3010185956954956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5932537317276001, + "epoch": 4.84, + "learning_rate": 1.543291872962203e-05, + "loss": 0.4499, + "step": 5725, + "task_loss": 0.9055598974227905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5364070534706116, + "epoch": 4.84, + "learning_rate": 1.5426880811496197e-05, + "loss": 0.5939, + "step": 5726, + "task_loss": 0.5620115995407104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5883762836456299, + "epoch": 4.84, + "learning_rate": 1.5420842893370367e-05, + "loss": 0.5198, + "step": 5727, + "task_loss": 1.2068294286727905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45011019706726074, + "epoch": 4.84, + "learning_rate": 1.5414804975244538e-05, + "loss": 0.6943, + "step": 5728, + "task_loss": 0.19900186359882355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4132983386516571, + "epoch": 4.84, + "learning_rate": 1.5408767057118705e-05, + "loss": 0.3591, + "step": 5729, + "task_loss": 0.4765997529029846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5018225312232971, + "epoch": 4.84, + "learning_rate": 1.5402729138992875e-05, + "loss": 0.5777, + "step": 5730, + "task_loss": 0.49483081698417664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30368196964263916, + "epoch": 4.84, + "learning_rate": 1.5396691220867046e-05, + "loss": 0.5301, + "step": 5731, + "task_loss": 0.8032054901123047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7635002136230469, + "epoch": 4.84, + "learning_rate": 1.5390653302741213e-05, + "loss": 0.68, + "step": 5732, + "task_loss": 1.0857393741607666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47251012921333313, + "epoch": 4.85, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.6151, + "step": 5733, + "task_loss": 1.1110893487930298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36363285779953003, + "epoch": 4.85, + "learning_rate": 1.5378577466489554e-05, + "loss": 0.3098, + "step": 5734, + "task_loss": 0.8031936883926392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5794199705123901, + "epoch": 4.85, + "learning_rate": 1.5372539548363725e-05, + "loss": 0.516, + "step": 5735, + "task_loss": 1.2402923107147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1697910875082016, + "epoch": 4.85, + "learning_rate": 1.5366501630237896e-05, + "loss": 0.4513, + "step": 5736, + "task_loss": 0.2355843037366867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28572651743888855, + "epoch": 4.85, + "learning_rate": 1.5360463712112063e-05, + "loss": 0.4089, + "step": 5737, + "task_loss": 0.22135579586029053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6795411109924316, + "epoch": 4.85, + "learning_rate": 1.5354425793986237e-05, + "loss": 0.6523, + "step": 5738, + "task_loss": 0.8354735970497131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6474465131759644, + "epoch": 4.85, + "learning_rate": 1.5348387875860404e-05, + "loss": 0.5865, + "step": 5739, + "task_loss": 0.654369056224823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5395057201385498, + "epoch": 4.85, + "learning_rate": 1.534234995773457e-05, + "loss": 0.5646, + "step": 5740, + "task_loss": 0.4729509651660919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3919467329978943, + "epoch": 4.85, + "learning_rate": 1.5336312039608745e-05, + "loss": 0.427, + "step": 5741, + "task_loss": 0.8063421845436096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.440171480178833, + "epoch": 4.85, + "learning_rate": 1.5330274121482912e-05, + "loss": 0.4307, + "step": 5742, + "task_loss": 0.759330153465271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6499356627464294, + "epoch": 4.85, + "learning_rate": 1.5324236203357083e-05, + "loss": 0.5415, + "step": 5743, + "task_loss": 0.21940068900585175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9460481405258179, + "epoch": 4.85, + "learning_rate": 1.5318198285231253e-05, + "loss": 0.5853, + "step": 5744, + "task_loss": 1.0863806009292603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3309619426727295, + "epoch": 4.86, + "learning_rate": 1.531216036710542e-05, + "loss": 0.5056, + "step": 5745, + "task_loss": 0.7600712776184082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4350525140762329, + "epoch": 4.86, + "learning_rate": 1.5306122448979594e-05, + "loss": 0.4527, + "step": 5746, + "task_loss": 1.0556528568267822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34537726640701294, + "epoch": 4.86, + "learning_rate": 1.530008453085376e-05, + "loss": 0.5393, + "step": 5747, + "task_loss": 0.7721340656280518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5988664031028748, + "epoch": 4.86, + "learning_rate": 1.5294046612727932e-05, + "loss": 0.5007, + "step": 5748, + "task_loss": 0.9986903667449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7229220867156982, + "epoch": 4.86, + "learning_rate": 1.5288008694602103e-05, + "loss": 0.6758, + "step": 5749, + "task_loss": 1.260180950164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4934806227684021, + "epoch": 4.86, + "learning_rate": 1.528197077647627e-05, + "loss": 0.5332, + "step": 5750, + "task_loss": 0.5637091398239136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6755383014678955, + "epoch": 4.86, + "learning_rate": 1.527593285835044e-05, + "loss": 0.6149, + "step": 5751, + "task_loss": 0.5080583095550537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4170818328857422, + "epoch": 4.86, + "learning_rate": 1.526989494022461e-05, + "loss": 0.6308, + "step": 5752, + "task_loss": 1.5215613842010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35623130202293396, + "epoch": 4.86, + "learning_rate": 1.526385702209878e-05, + "loss": 0.4771, + "step": 5753, + "task_loss": 0.24386626482009888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4084607660770416, + "epoch": 4.86, + "learning_rate": 1.525781910397295e-05, + "loss": 0.4323, + "step": 5754, + "task_loss": 0.7456134557723999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4132385551929474, + "epoch": 4.86, + "learning_rate": 1.525178118584712e-05, + "loss": 0.4349, + "step": 5755, + "task_loss": 1.4611207246780396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46655547618865967, + "epoch": 4.87, + "learning_rate": 1.524574326772129e-05, + "loss": 0.4404, + "step": 5756, + "task_loss": 0.8669162392616272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5696276426315308, + "epoch": 4.87, + "learning_rate": 1.5239705349595459e-05, + "loss": 0.369, + "step": 5757, + "task_loss": 0.17867420613765717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40233302116394043, + "epoch": 4.87, + "learning_rate": 1.5233667431469631e-05, + "loss": 0.4699, + "step": 5758, + "task_loss": 0.7311951518058777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7507137060165405, + "epoch": 4.87, + "learning_rate": 1.52276295133438e-05, + "loss": 0.5608, + "step": 5759, + "task_loss": 1.178305745124817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3480607271194458, + "epoch": 4.87, + "learning_rate": 1.5221591595217969e-05, + "loss": 0.5158, + "step": 5760, + "task_loss": 0.13337811827659607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5539226531982422, + "epoch": 4.87, + "learning_rate": 1.521555367709214e-05, + "loss": 0.6271, + "step": 5761, + "task_loss": 0.17125360667705536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3317650258541107, + "epoch": 4.87, + "learning_rate": 1.5209515758966308e-05, + "loss": 0.3665, + "step": 5762, + "task_loss": 0.22879157960414886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45853376388549805, + "epoch": 4.87, + "learning_rate": 1.520347784084048e-05, + "loss": 0.4243, + "step": 5763, + "task_loss": 0.22903874516487122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.539421796798706, + "epoch": 4.87, + "learning_rate": 1.5197439922714648e-05, + "loss": 0.5323, + "step": 5764, + "task_loss": 0.7340229153633118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48800137639045715, + "epoch": 4.87, + "learning_rate": 1.5191402004588817e-05, + "loss": 0.4599, + "step": 5765, + "task_loss": 0.4793279469013214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5754932761192322, + "epoch": 4.87, + "learning_rate": 1.5185364086462989e-05, + "loss": 0.4746, + "step": 5766, + "task_loss": 1.3344985246658325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4941015839576721, + "epoch": 4.87, + "learning_rate": 1.5179326168337158e-05, + "loss": 0.4667, + "step": 5767, + "task_loss": 0.7423571348190308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5405023694038391, + "epoch": 4.88, + "learning_rate": 1.5173288250211328e-05, + "loss": 0.5102, + "step": 5768, + "task_loss": 1.2611517906188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4547557830810547, + "epoch": 4.88, + "learning_rate": 1.5167250332085497e-05, + "loss": 0.572, + "step": 5769, + "task_loss": 0.2531443238258362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44711682200431824, + "epoch": 4.88, + "learning_rate": 1.5161212413959666e-05, + "loss": 0.452, + "step": 5770, + "task_loss": 0.41396364569664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28641533851623535, + "epoch": 4.88, + "learning_rate": 1.5155174495833838e-05, + "loss": 0.4711, + "step": 5771, + "task_loss": 0.7494364380836487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5031849145889282, + "epoch": 4.88, + "learning_rate": 1.5149136577708006e-05, + "loss": 0.4776, + "step": 5772, + "task_loss": 1.0330506563186646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5102938413619995, + "epoch": 4.88, + "learning_rate": 1.5143098659582178e-05, + "loss": 0.5076, + "step": 5773, + "task_loss": 0.7059791088104248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4198932647705078, + "epoch": 4.88, + "learning_rate": 1.5137060741456347e-05, + "loss": 0.4032, + "step": 5774, + "task_loss": 0.5342229604721069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.283042848110199, + "epoch": 4.88, + "learning_rate": 1.5131022823330516e-05, + "loss": 0.3216, + "step": 5775, + "task_loss": 0.49188709259033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3742866814136505, + "epoch": 4.88, + "learning_rate": 1.5124984905204686e-05, + "loss": 0.3684, + "step": 5776, + "task_loss": 0.9383265376091003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3816741704940796, + "epoch": 4.88, + "learning_rate": 1.5118946987078855e-05, + "loss": 0.4913, + "step": 5777, + "task_loss": 0.5717141628265381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5413243174552917, + "epoch": 4.88, + "learning_rate": 1.5112909068953027e-05, + "loss": 0.4815, + "step": 5778, + "task_loss": 1.2612712383270264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5110476016998291, + "epoch": 4.88, + "learning_rate": 1.5106871150827196e-05, + "loss": 0.5065, + "step": 5779, + "task_loss": 0.7782415747642517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5100208520889282, + "epoch": 4.89, + "learning_rate": 1.5100833232701363e-05, + "loss": 0.3971, + "step": 5780, + "task_loss": 1.1203153133392334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5307128429412842, + "epoch": 4.89, + "learning_rate": 1.5094795314575536e-05, + "loss": 0.4121, + "step": 5781, + "task_loss": 1.0690267086029053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48200416564941406, + "epoch": 4.89, + "learning_rate": 1.5088757396449705e-05, + "loss": 0.533, + "step": 5782, + "task_loss": 0.43080538511276245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3921981155872345, + "epoch": 4.89, + "learning_rate": 1.5082719478323875e-05, + "loss": 0.4993, + "step": 5783, + "task_loss": 0.9054892659187317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48124080896377563, + "epoch": 4.89, + "learning_rate": 1.5076681560198044e-05, + "loss": 0.3361, + "step": 5784, + "task_loss": 0.9808400273323059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.62503981590271, + "epoch": 4.89, + "learning_rate": 1.5070643642072213e-05, + "loss": 0.4855, + "step": 5785, + "task_loss": 1.0295519828796387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.832899808883667, + "epoch": 4.89, + "learning_rate": 1.5064605723946385e-05, + "loss": 0.5432, + "step": 5786, + "task_loss": 0.9598299264907837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.1975598335266113, + "epoch": 4.89, + "learning_rate": 1.5058567805820554e-05, + "loss": 0.6281, + "step": 5787, + "task_loss": 0.6961100697517395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30831629037857056, + "epoch": 4.89, + "learning_rate": 1.5052529887694725e-05, + "loss": 0.5169, + "step": 5788, + "task_loss": 0.7045948505401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48360204696655273, + "epoch": 4.89, + "learning_rate": 1.5046491969568893e-05, + "loss": 0.4486, + "step": 5789, + "task_loss": 0.598354697227478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3582944869995117, + "epoch": 4.89, + "learning_rate": 1.5040454051443062e-05, + "loss": 0.5294, + "step": 5790, + "task_loss": 0.5948873162269592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3131830394268036, + "epoch": 4.89, + "learning_rate": 1.5034416133317233e-05, + "loss": 0.3463, + "step": 5791, + "task_loss": 0.2515471577644348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4120029807090759, + "epoch": 4.9, + "learning_rate": 1.5028378215191402e-05, + "loss": 0.3721, + "step": 5792, + "task_loss": 0.18850304186344147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24460455775260925, + "epoch": 4.9, + "learning_rate": 1.5022340297065574e-05, + "loss": 0.4778, + "step": 5793, + "task_loss": 1.0462384223937988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41444817185401917, + "epoch": 4.9, + "learning_rate": 1.5016302378939743e-05, + "loss": 0.5523, + "step": 5794, + "task_loss": 1.0514357089996338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6986926198005676, + "epoch": 4.9, + "learning_rate": 1.501026446081391e-05, + "loss": 0.4164, + "step": 5795, + "task_loss": 0.21206903457641602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46723219752311707, + "epoch": 4.9, + "learning_rate": 1.5004226542688082e-05, + "loss": 0.4921, + "step": 5796, + "task_loss": 0.3758774697780609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9450217485427856, + "epoch": 4.9, + "learning_rate": 1.4998188624562251e-05, + "loss": 0.5995, + "step": 5797, + "task_loss": 0.6682865619659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40183451771736145, + "epoch": 4.9, + "learning_rate": 1.4992150706436422e-05, + "loss": 0.5237, + "step": 5798, + "task_loss": 0.8499565124511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39154326915740967, + "epoch": 4.9, + "learning_rate": 1.498611278831059e-05, + "loss": 0.4685, + "step": 5799, + "task_loss": 0.4589754045009613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4009566307067871, + "epoch": 4.9, + "learning_rate": 1.498007487018476e-05, + "loss": 0.4869, + "step": 5800, + "task_loss": 0.5028212666511536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6072397232055664, + "epoch": 4.9, + "learning_rate": 1.4974036952058932e-05, + "loss": 0.5621, + "step": 5801, + "task_loss": 1.2244374752044678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.706192672252655, + "epoch": 4.9, + "learning_rate": 1.49679990339331e-05, + "loss": 0.4988, + "step": 5802, + "task_loss": 1.1101514101028442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36713117361068726, + "epoch": 4.9, + "learning_rate": 1.4961961115807271e-05, + "loss": 0.4911, + "step": 5803, + "task_loss": 0.5822126865386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42061156034469604, + "epoch": 4.91, + "learning_rate": 1.495592319768144e-05, + "loss": 0.6164, + "step": 5804, + "task_loss": 0.8172554969787598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7587139010429382, + "epoch": 4.91, + "learning_rate": 1.4949885279555609e-05, + "loss": 0.4922, + "step": 5805, + "task_loss": 0.596482515335083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4406343698501587, + "epoch": 4.91, + "learning_rate": 1.494384736142978e-05, + "loss": 0.4552, + "step": 5806, + "task_loss": 0.7844243049621582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2350749969482422, + "epoch": 4.91, + "learning_rate": 1.4937809443303948e-05, + "loss": 0.3486, + "step": 5807, + "task_loss": 0.4691348373889923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2896985113620758, + "epoch": 4.91, + "learning_rate": 1.493177152517812e-05, + "loss": 0.6549, + "step": 5808, + "task_loss": 0.35225725173950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6432271003723145, + "epoch": 4.91, + "learning_rate": 1.492573360705229e-05, + "loss": 0.4885, + "step": 5809, + "task_loss": 1.1570217609405518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3027561902999878, + "epoch": 4.91, + "learning_rate": 1.4919695688926458e-05, + "loss": 0.4474, + "step": 5810, + "task_loss": 1.0662912130355835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40217119455337524, + "epoch": 4.91, + "learning_rate": 1.4913657770800629e-05, + "loss": 0.5823, + "step": 5811, + "task_loss": 0.7301077842712402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47895097732543945, + "epoch": 4.91, + "learning_rate": 1.4907619852674798e-05, + "loss": 0.452, + "step": 5812, + "task_loss": 0.3648097813129425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31992220878601074, + "epoch": 4.91, + "learning_rate": 1.4901581934548968e-05, + "loss": 0.4001, + "step": 5813, + "task_loss": 0.7621862888336182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5665308833122253, + "epoch": 4.91, + "learning_rate": 1.4895544016423137e-05, + "loss": 0.5896, + "step": 5814, + "task_loss": 0.5625012516975403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41491222381591797, + "epoch": 4.91, + "learning_rate": 1.4889506098297306e-05, + "loss": 0.4595, + "step": 5815, + "task_loss": 0.9698602557182312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3085525333881378, + "epoch": 4.92, + "learning_rate": 1.4883468180171479e-05, + "loss": 0.4151, + "step": 5816, + "task_loss": 0.9177283048629761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3345073461532593, + "epoch": 4.92, + "learning_rate": 1.4877430262045647e-05, + "loss": 0.3756, + "step": 5817, + "task_loss": 0.6464024186134338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3889596462249756, + "epoch": 4.92, + "learning_rate": 1.4871392343919818e-05, + "loss": 0.4763, + "step": 5818, + "task_loss": 0.5566734671592712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5989587903022766, + "epoch": 4.92, + "learning_rate": 1.4865354425793987e-05, + "loss": 0.4554, + "step": 5819, + "task_loss": 0.12948866188526154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3671024739742279, + "epoch": 4.92, + "learning_rate": 1.4859316507668156e-05, + "loss": 0.5576, + "step": 5820, + "task_loss": 0.43852099776268005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5567774772644043, + "epoch": 4.92, + "learning_rate": 1.4853278589542326e-05, + "loss": 0.471, + "step": 5821, + "task_loss": 1.3301247358322144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34113407135009766, + "epoch": 4.92, + "learning_rate": 1.4847240671416495e-05, + "loss": 0.458, + "step": 5822, + "task_loss": 0.978023111820221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7804850935935974, + "epoch": 4.92, + "learning_rate": 1.4841202753290667e-05, + "loss": 0.5817, + "step": 5823, + "task_loss": 1.078391671180725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6289469003677368, + "epoch": 4.92, + "learning_rate": 1.4835164835164836e-05, + "loss": 0.4885, + "step": 5824, + "task_loss": 0.5521443486213684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4120987355709076, + "epoch": 4.92, + "learning_rate": 1.4829126917039005e-05, + "loss": 0.4166, + "step": 5825, + "task_loss": 0.27023211121559143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7844724655151367, + "epoch": 4.92, + "learning_rate": 1.4823088998913176e-05, + "loss": 0.5908, + "step": 5826, + "task_loss": 1.4410432577133179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30639687180519104, + "epoch": 4.93, + "learning_rate": 1.4817051080787345e-05, + "loss": 0.4905, + "step": 5827, + "task_loss": 0.437218576669693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2257484644651413, + "epoch": 4.93, + "learning_rate": 1.4811013162661517e-05, + "loss": 0.3364, + "step": 5828, + "task_loss": 0.17178605496883392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38050517439842224, + "epoch": 4.93, + "learning_rate": 1.4804975244535684e-05, + "loss": 0.5635, + "step": 5829, + "task_loss": 1.1554322242736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6037095189094543, + "epoch": 4.93, + "learning_rate": 1.4798937326409853e-05, + "loss": 0.4597, + "step": 5830, + "task_loss": 0.19461466372013092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6499226093292236, + "epoch": 4.93, + "learning_rate": 1.4792899408284025e-05, + "loss": 0.5116, + "step": 5831, + "task_loss": 0.6344338655471802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6488349437713623, + "epoch": 4.93, + "learning_rate": 1.4786861490158194e-05, + "loss": 0.4408, + "step": 5832, + "task_loss": 0.9087602496147156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4854084551334381, + "epoch": 4.93, + "learning_rate": 1.4780823572032365e-05, + "loss": 0.4642, + "step": 5833, + "task_loss": 0.3827601671218872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3597487807273865, + "epoch": 4.93, + "learning_rate": 1.4774785653906534e-05, + "loss": 0.4407, + "step": 5834, + "task_loss": 0.28483617305755615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41044533252716064, + "epoch": 4.93, + "learning_rate": 1.4768747735780702e-05, + "loss": 0.4847, + "step": 5835, + "task_loss": 0.532178521156311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5367692708969116, + "epoch": 4.93, + "learning_rate": 1.4762709817654875e-05, + "loss": 0.4745, + "step": 5836, + "task_loss": 0.9414312243461609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32648587226867676, + "epoch": 4.93, + "learning_rate": 1.4756671899529042e-05, + "loss": 0.4396, + "step": 5837, + "task_loss": 0.48526817560195923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5288338661193848, + "epoch": 4.93, + "learning_rate": 1.4750633981403214e-05, + "loss": 0.5427, + "step": 5838, + "task_loss": 1.1049845218658447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3779226243495941, + "epoch": 4.94, + "learning_rate": 1.4744596063277383e-05, + "loss": 0.4833, + "step": 5839, + "task_loss": 0.2531374990940094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30419260263442993, + "epoch": 4.94, + "learning_rate": 1.4738558145151552e-05, + "loss": 0.344, + "step": 5840, + "task_loss": 0.20573073625564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47613024711608887, + "epoch": 4.94, + "learning_rate": 1.4732520227025722e-05, + "loss": 0.4087, + "step": 5841, + "task_loss": 0.5432329177856445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4031655490398407, + "epoch": 4.94, + "learning_rate": 1.4726482308899891e-05, + "loss": 0.4612, + "step": 5842, + "task_loss": 0.26833173632621765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4328988194465637, + "epoch": 4.94, + "learning_rate": 1.4720444390774064e-05, + "loss": 0.5883, + "step": 5843, + "task_loss": 0.5638561844825745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6232972145080566, + "epoch": 4.94, + "learning_rate": 1.4714406472648232e-05, + "loss": 0.5523, + "step": 5844, + "task_loss": 0.9876930713653564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2712046802043915, + "epoch": 4.94, + "learning_rate": 1.47083685545224e-05, + "loss": 0.3715, + "step": 5845, + "task_loss": 0.03049248829483986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5060746073722839, + "epoch": 4.94, + "learning_rate": 1.4702330636396572e-05, + "loss": 0.4787, + "step": 5846, + "task_loss": 0.6439326405525208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3097953200340271, + "epoch": 4.94, + "learning_rate": 1.469629271827074e-05, + "loss": 0.449, + "step": 5847, + "task_loss": 0.16452401876449585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8650487661361694, + "epoch": 4.94, + "learning_rate": 1.4690254800144911e-05, + "loss": 0.5764, + "step": 5848, + "task_loss": 1.339842438697815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6167279481887817, + "epoch": 4.94, + "learning_rate": 1.468421688201908e-05, + "loss": 0.5044, + "step": 5849, + "task_loss": 0.6929930448532104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3980950117111206, + "epoch": 4.94, + "learning_rate": 1.4678178963893249e-05, + "loss": 0.5074, + "step": 5850, + "task_loss": 0.8631390333175659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3490990996360779, + "epoch": 4.95, + "learning_rate": 1.4672141045767421e-05, + "loss": 0.4935, + "step": 5851, + "task_loss": 0.270679235458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3136811852455139, + "epoch": 4.95, + "learning_rate": 1.466610312764159e-05, + "loss": 0.6327, + "step": 5852, + "task_loss": 1.1166059970855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3603687882423401, + "epoch": 4.95, + "learning_rate": 1.466006520951576e-05, + "loss": 0.4278, + "step": 5853, + "task_loss": 0.2023213654756546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25370514392852783, + "epoch": 4.95, + "learning_rate": 1.465402729138993e-05, + "loss": 0.5277, + "step": 5854, + "task_loss": 0.7227314114570618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3278053402900696, + "epoch": 4.95, + "learning_rate": 1.4647989373264099e-05, + "loss": 0.5141, + "step": 5855, + "task_loss": 0.2890128195285797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6116062998771667, + "epoch": 4.95, + "learning_rate": 1.464195145513827e-05, + "loss": 0.5481, + "step": 5856, + "task_loss": 0.4133390486240387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5500404834747314, + "epoch": 4.95, + "learning_rate": 1.4635913537012438e-05, + "loss": 0.4849, + "step": 5857, + "task_loss": 0.6949512958526611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5884612798690796, + "epoch": 4.95, + "learning_rate": 1.462987561888661e-05, + "loss": 0.512, + "step": 5858, + "task_loss": 1.1272770166397095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4891970753669739, + "epoch": 4.95, + "learning_rate": 1.462383770076078e-05, + "loss": 0.6648, + "step": 5859, + "task_loss": 0.7570291757583618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3876471221446991, + "epoch": 4.95, + "learning_rate": 1.4617799782634946e-05, + "loss": 0.3473, + "step": 5860, + "task_loss": 0.3063424825668335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3969310522079468, + "epoch": 4.95, + "learning_rate": 1.4611761864509119e-05, + "loss": 0.6064, + "step": 5861, + "task_loss": 0.6238088607788086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29979634284973145, + "epoch": 4.95, + "learning_rate": 1.4605723946383288e-05, + "loss": 0.5242, + "step": 5862, + "task_loss": 0.06515660136938095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37218183279037476, + "epoch": 4.96, + "learning_rate": 1.4599686028257458e-05, + "loss": 0.4648, + "step": 5863, + "task_loss": 0.152825266122818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2802339494228363, + "epoch": 4.96, + "learning_rate": 1.4593648110131627e-05, + "loss": 0.5194, + "step": 5864, + "task_loss": 0.3493371307849884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4369129240512848, + "epoch": 4.96, + "learning_rate": 1.4587610192005796e-05, + "loss": 0.601, + "step": 5865, + "task_loss": 1.2810916900634766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4378892183303833, + "epoch": 4.96, + "learning_rate": 1.4581572273879968e-05, + "loss": 0.4517, + "step": 5866, + "task_loss": 0.4537532925605774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4011518955230713, + "epoch": 4.96, + "learning_rate": 1.4575534355754137e-05, + "loss": 0.5105, + "step": 5867, + "task_loss": 1.0126134157180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4999954104423523, + "epoch": 4.96, + "learning_rate": 1.4569496437628308e-05, + "loss": 0.5696, + "step": 5868, + "task_loss": 0.691393256187439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5981533527374268, + "epoch": 4.96, + "learning_rate": 1.4563458519502476e-05, + "loss": 0.547, + "step": 5869, + "task_loss": 0.8819162845611572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3164193630218506, + "epoch": 4.96, + "learning_rate": 1.4557420601376645e-05, + "loss": 0.4389, + "step": 5870, + "task_loss": 1.1230006217956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40841490030288696, + "epoch": 4.96, + "learning_rate": 1.4551382683250816e-05, + "loss": 0.3507, + "step": 5871, + "task_loss": 0.15326306223869324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5229266881942749, + "epoch": 4.96, + "learning_rate": 1.4545344765124985e-05, + "loss": 0.5676, + "step": 5872, + "task_loss": 0.5109989047050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6146092414855957, + "epoch": 4.96, + "learning_rate": 1.4539306846999157e-05, + "loss": 0.3857, + "step": 5873, + "task_loss": 0.47085052728652954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31163936853408813, + "epoch": 4.96, + "learning_rate": 1.4533268928873326e-05, + "loss": 0.4197, + "step": 5874, + "task_loss": 0.30774977803230286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2735847234725952, + "epoch": 4.97, + "learning_rate": 1.4527231010747495e-05, + "loss": 0.4672, + "step": 5875, + "task_loss": 0.6831485629081726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5137497186660767, + "epoch": 4.97, + "learning_rate": 1.4521193092621665e-05, + "loss": 0.4959, + "step": 5876, + "task_loss": 0.1318991482257843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24189987778663635, + "epoch": 4.97, + "learning_rate": 1.4515155174495834e-05, + "loss": 0.4358, + "step": 5877, + "task_loss": 0.8447002172470093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2813869118690491, + "epoch": 4.97, + "learning_rate": 1.4509117256370005e-05, + "loss": 0.5095, + "step": 5878, + "task_loss": 0.4457683265209198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8438689708709717, + "epoch": 4.97, + "learning_rate": 1.4503079338244174e-05, + "loss": 0.763, + "step": 5879, + "task_loss": 1.0650538206100464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3680497407913208, + "epoch": 4.97, + "learning_rate": 1.4497041420118343e-05, + "loss": 0.4162, + "step": 5880, + "task_loss": 0.3631020486354828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6185603737831116, + "epoch": 4.97, + "learning_rate": 1.4491003501992515e-05, + "loss": 0.6073, + "step": 5881, + "task_loss": 1.1768121719360352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3132491111755371, + "epoch": 4.97, + "learning_rate": 1.4484965583866684e-05, + "loss": 0.4332, + "step": 5882, + "task_loss": 0.6072292327880859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.451108455657959, + "epoch": 4.97, + "learning_rate": 1.4478927665740854e-05, + "loss": 0.4479, + "step": 5883, + "task_loss": 0.9165635704994202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5627396106719971, + "epoch": 4.97, + "learning_rate": 1.4472889747615023e-05, + "loss": 0.572, + "step": 5884, + "task_loss": 0.39646631479263306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8555253744125366, + "epoch": 4.97, + "learning_rate": 1.4466851829489192e-05, + "loss": 0.5001, + "step": 5885, + "task_loss": 0.6878842711448669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5245431661605835, + "epoch": 4.97, + "learning_rate": 1.4460813911363363e-05, + "loss": 0.4436, + "step": 5886, + "task_loss": 0.6413938403129578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19140513241291046, + "epoch": 4.98, + "learning_rate": 1.4454775993237531e-05, + "loss": 0.2676, + "step": 5887, + "task_loss": 0.13698330521583557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.957083523273468, + "epoch": 4.98, + "learning_rate": 1.4448738075111704e-05, + "loss": 0.5191, + "step": 5888, + "task_loss": 0.8026174306869507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.264891654253006, + "epoch": 4.98, + "learning_rate": 1.4442700156985873e-05, + "loss": 0.3094, + "step": 5889, + "task_loss": 0.30313175916671753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5777595043182373, + "epoch": 4.98, + "learning_rate": 1.4436662238860041e-05, + "loss": 0.4691, + "step": 5890, + "task_loss": 0.6727564334869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.13674266636371613, + "epoch": 4.98, + "learning_rate": 1.4430624320734212e-05, + "loss": 0.379, + "step": 5891, + "task_loss": 0.27789682149887085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5818727612495422, + "epoch": 4.98, + "learning_rate": 1.4424586402608381e-05, + "loss": 0.6628, + "step": 5892, + "task_loss": 0.6469494700431824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5610709190368652, + "epoch": 4.98, + "learning_rate": 1.4418548484482553e-05, + "loss": 0.5601, + "step": 5893, + "task_loss": 0.2761783301830292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6625365018844604, + "epoch": 4.98, + "learning_rate": 1.441251056635672e-05, + "loss": 0.4939, + "step": 5894, + "task_loss": 1.0583909749984741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.581427276134491, + "epoch": 4.98, + "learning_rate": 1.440647264823089e-05, + "loss": 0.5764, + "step": 5895, + "task_loss": 0.6895261406898499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6571460366249084, + "epoch": 4.98, + "learning_rate": 1.4400434730105062e-05, + "loss": 0.5062, + "step": 5896, + "task_loss": 0.74186110496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4037855267524719, + "epoch": 4.98, + "learning_rate": 1.439439681197923e-05, + "loss": 0.444, + "step": 5897, + "task_loss": 0.5062740445137024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28357407450675964, + "epoch": 4.99, + "learning_rate": 1.4388358893853401e-05, + "loss": 0.3359, + "step": 5898, + "task_loss": 0.23819175362586975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4217330813407898, + "epoch": 4.99, + "learning_rate": 1.438232097572757e-05, + "loss": 0.4246, + "step": 5899, + "task_loss": 0.6964184641838074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9314852356910706, + "epoch": 4.99, + "learning_rate": 1.4376283057601739e-05, + "loss": 0.6022, + "step": 5900, + "task_loss": 0.9725218415260315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38695868849754333, + "epoch": 4.99, + "learning_rate": 1.4370245139475911e-05, + "loss": 0.3766, + "step": 5901, + "task_loss": 0.6227385401725769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6097522974014282, + "epoch": 4.99, + "learning_rate": 1.4364207221350078e-05, + "loss": 0.5886, + "step": 5902, + "task_loss": 0.8588147759437561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8629091382026672, + "epoch": 4.99, + "learning_rate": 1.4358169303224247e-05, + "loss": 0.6821, + "step": 5903, + "task_loss": 0.5130578279495239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5736752152442932, + "epoch": 4.99, + "learning_rate": 1.435213138509842e-05, + "loss": 0.4862, + "step": 5904, + "task_loss": 0.5569455027580261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.645703911781311, + "epoch": 4.99, + "learning_rate": 1.4346093466972588e-05, + "loss": 0.5298, + "step": 5905, + "task_loss": 0.5157227516174316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37729692459106445, + "epoch": 4.99, + "learning_rate": 1.4340055548846759e-05, + "loss": 0.4047, + "step": 5906, + "task_loss": 0.5590080618858337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5275990962982178, + "epoch": 4.99, + "learning_rate": 1.4334017630720928e-05, + "loss": 0.6381, + "step": 5907, + "task_loss": 0.979600191116333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5278711318969727, + "epoch": 4.99, + "learning_rate": 1.4327979712595097e-05, + "loss": 0.4737, + "step": 5908, + "task_loss": 0.5000836253166199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37122756242752075, + "epoch": 4.99, + "learning_rate": 1.4321941794469269e-05, + "loss": 0.5042, + "step": 5909, + "task_loss": 1.2997733354568481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6282119750976562, + "epoch": 5.0, + "learning_rate": 1.4315903876343436e-05, + "loss": 0.5548, + "step": 5910, + "task_loss": 0.7074734568595886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4617100954055786, + "epoch": 5.0, + "learning_rate": 1.4309865958217608e-05, + "loss": 0.3705, + "step": 5911, + "task_loss": 0.8815626502037048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45544254779815674, + "epoch": 5.0, + "learning_rate": 1.4303828040091777e-05, + "loss": 0.5654, + "step": 5912, + "task_loss": 0.22035664319992065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.600818395614624, + "epoch": 5.0, + "learning_rate": 1.4297790121965946e-05, + "loss": 0.5078, + "step": 5913, + "task_loss": 0.3404569923877716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8749569654464722, + "epoch": 5.0, + "learning_rate": 1.4291752203840117e-05, + "loss": 0.534, + "step": 5914, + "task_loss": 0.820824384689331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.252996027469635, + "epoch": 5.0, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.4369, + "step": 5915, + "task_loss": 0.5022522807121277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3851630389690399, + "epoch": 5.0, + "learning_rate": 1.4279676367588458e-05, + "loss": 0.8312, + "step": 5916, + "task_loss": 0.5981258749961853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5213533639907837, + "epoch": 5.0, + "learning_rate": 1.4273638449462627e-05, + "loss": 0.4492, + "step": 5917, + "task_loss": 0.5999496579170227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7086391448974609, + "epoch": 5.0, + "learning_rate": 1.4267600531336794e-05, + "loss": 0.5934, + "step": 5918, + "task_loss": 0.9338732361793518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5655124187469482, + "epoch": 5.0, + "learning_rate": 1.4261562613210966e-05, + "loss": 0.4947, + "step": 5919, + "task_loss": 0.6104177832603455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.824927806854248, + "epoch": 5.0, + "learning_rate": 1.4255524695085135e-05, + "loss": 0.5232, + "step": 5920, + "task_loss": 0.8307041525840759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7413452863693237, + "epoch": 5.01, + "learning_rate": 1.4249486776959305e-05, + "loss": 0.5431, + "step": 5921, + "task_loss": 0.5468769669532776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39320409297943115, + "epoch": 5.01, + "learning_rate": 1.4243448858833474e-05, + "loss": 0.4708, + "step": 5922, + "task_loss": 0.7387615442276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.79927659034729, + "epoch": 5.01, + "learning_rate": 1.4237410940707643e-05, + "loss": 0.6666, + "step": 5923, + "task_loss": 0.6731265187263489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36455684900283813, + "epoch": 5.01, + "learning_rate": 1.4231373022581815e-05, + "loss": 0.4146, + "step": 5924, + "task_loss": 0.5640330910682678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5359421968460083, + "epoch": 5.01, + "learning_rate": 1.4225335104455983e-05, + "loss": 0.4275, + "step": 5925, + "task_loss": 0.505700409412384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17871074378490448, + "epoch": 5.01, + "learning_rate": 1.4219297186330155e-05, + "loss": 0.4018, + "step": 5926, + "task_loss": 0.8501909375190735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4953292906284332, + "epoch": 5.01, + "learning_rate": 1.4213259268204324e-05, + "loss": 0.4543, + "step": 5927, + "task_loss": 1.4153022766113281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4085580110549927, + "epoch": 5.01, + "learning_rate": 1.4207221350078493e-05, + "loss": 0.4821, + "step": 5928, + "task_loss": 0.08659082651138306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5490870475769043, + "epoch": 5.01, + "learning_rate": 1.4201183431952663e-05, + "loss": 0.4103, + "step": 5929, + "task_loss": 0.20959487557411194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4751938581466675, + "epoch": 5.01, + "learning_rate": 1.4195145513826832e-05, + "loss": 0.4259, + "step": 5930, + "task_loss": 0.21572832763195038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5338667035102844, + "epoch": 5.01, + "learning_rate": 1.4189107595701004e-05, + "loss": 0.5098, + "step": 5931, + "task_loss": 1.6079308986663818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25754159688949585, + "epoch": 5.01, + "learning_rate": 1.4183069677575173e-05, + "loss": 0.3872, + "step": 5932, + "task_loss": 0.5583166480064392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3172677755355835, + "epoch": 5.02, + "learning_rate": 1.417703175944934e-05, + "loss": 0.3944, + "step": 5933, + "task_loss": 1.0343488454818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34228354692459106, + "epoch": 5.02, + "learning_rate": 1.4170993841323513e-05, + "loss": 0.4722, + "step": 5934, + "task_loss": 0.49838778376579285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4840555489063263, + "epoch": 5.02, + "learning_rate": 1.4164955923197682e-05, + "loss": 0.4237, + "step": 5935, + "task_loss": 0.5111437439918518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3206411302089691, + "epoch": 5.02, + "learning_rate": 1.4158918005071852e-05, + "loss": 0.4337, + "step": 5936, + "task_loss": 0.1432463526725769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4909075200557709, + "epoch": 5.02, + "learning_rate": 1.4152880086946021e-05, + "loss": 0.4344, + "step": 5937, + "task_loss": 1.1542472839355469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4109368622303009, + "epoch": 5.02, + "learning_rate": 1.414684216882019e-05, + "loss": 0.5097, + "step": 5938, + "task_loss": 0.29364532232284546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7045813798904419, + "epoch": 5.02, + "learning_rate": 1.4140804250694362e-05, + "loss": 0.5035, + "step": 5939, + "task_loss": 1.4589147567749023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5719457864761353, + "epoch": 5.02, + "learning_rate": 1.4134766332568531e-05, + "loss": 0.5066, + "step": 5940, + "task_loss": 0.5735933184623718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6096122860908508, + "epoch": 5.02, + "learning_rate": 1.4128728414442702e-05, + "loss": 0.4796, + "step": 5941, + "task_loss": 1.2086384296417236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6042701005935669, + "epoch": 5.02, + "learning_rate": 1.412269049631687e-05, + "loss": 0.4657, + "step": 5942, + "task_loss": 0.4611886441707611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5211130380630493, + "epoch": 5.02, + "learning_rate": 1.411665257819104e-05, + "loss": 0.5548, + "step": 5943, + "task_loss": 0.8150731325149536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.285304456949234, + "epoch": 5.02, + "learning_rate": 1.411061466006521e-05, + "loss": 0.4676, + "step": 5944, + "task_loss": 0.6937703490257263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6477197408676147, + "epoch": 5.03, + "learning_rate": 1.4104576741939379e-05, + "loss": 0.4886, + "step": 5945, + "task_loss": 0.523460865020752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.638511061668396, + "epoch": 5.03, + "learning_rate": 1.4098538823813551e-05, + "loss": 0.3574, + "step": 5946, + "task_loss": 0.2648000121116638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3840251863002777, + "epoch": 5.03, + "learning_rate": 1.409250090568772e-05, + "loss": 0.6735, + "step": 5947, + "task_loss": 0.5291072726249695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36865752935409546, + "epoch": 5.03, + "learning_rate": 1.4086462987561889e-05, + "loss": 0.4441, + "step": 5948, + "task_loss": 1.163393259048462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40287458896636963, + "epoch": 5.03, + "learning_rate": 1.408042506943606e-05, + "loss": 0.4299, + "step": 5949, + "task_loss": 0.2987561523914337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35082125663757324, + "epoch": 5.03, + "learning_rate": 1.4074387151310228e-05, + "loss": 0.3744, + "step": 5950, + "task_loss": 0.3797934353351593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38333818316459656, + "epoch": 5.03, + "learning_rate": 1.4068349233184399e-05, + "loss": 0.5009, + "step": 5951, + "task_loss": 0.8500578999519348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7343951463699341, + "epoch": 5.03, + "learning_rate": 1.4062311315058568e-05, + "loss": 0.4746, + "step": 5952, + "task_loss": 0.3409634232521057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43828216195106506, + "epoch": 5.03, + "learning_rate": 1.4056273396932737e-05, + "loss": 0.3911, + "step": 5953, + "task_loss": 1.0307278633117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3163219690322876, + "epoch": 5.03, + "learning_rate": 1.4050235478806909e-05, + "loss": 0.3713, + "step": 5954, + "task_loss": 0.18032242357730865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8195264339447021, + "epoch": 5.03, + "learning_rate": 1.4044197560681078e-05, + "loss": 0.5284, + "step": 5955, + "task_loss": 1.0049339532852173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3877018094062805, + "epoch": 5.03, + "learning_rate": 1.4038159642555248e-05, + "loss": 0.3698, + "step": 5956, + "task_loss": 0.5924592018127441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2178558111190796, + "epoch": 5.04, + "learning_rate": 1.4032121724429417e-05, + "loss": 0.3679, + "step": 5957, + "task_loss": 0.24024835228919983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7152705192565918, + "epoch": 5.04, + "learning_rate": 1.4026083806303586e-05, + "loss": 0.587, + "step": 5958, + "task_loss": 0.7356069087982178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.56902015209198, + "epoch": 5.04, + "learning_rate": 1.4020045888177757e-05, + "loss": 0.4732, + "step": 5959, + "task_loss": 1.5463863611221313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3452136218547821, + "epoch": 5.04, + "learning_rate": 1.4014007970051926e-05, + "loss": 0.4203, + "step": 5960, + "task_loss": 0.19633322954177856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3555434048175812, + "epoch": 5.04, + "learning_rate": 1.4007970051926098e-05, + "loss": 0.5184, + "step": 5961, + "task_loss": 0.227996826171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4558466076850891, + "epoch": 5.04, + "learning_rate": 1.4001932133800267e-05, + "loss": 0.4376, + "step": 5962, + "task_loss": 0.6436222791671753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49706026911735535, + "epoch": 5.04, + "learning_rate": 1.3995894215674436e-05, + "loss": 0.5675, + "step": 5963, + "task_loss": 1.4766284227371216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3784533441066742, + "epoch": 5.04, + "learning_rate": 1.3989856297548606e-05, + "loss": 0.4673, + "step": 5964, + "task_loss": 0.5140158534049988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4315754473209381, + "epoch": 5.04, + "learning_rate": 1.3983818379422775e-05, + "loss": 0.434, + "step": 5965, + "task_loss": 1.3529188632965088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5177199840545654, + "epoch": 5.04, + "learning_rate": 1.3977780461296947e-05, + "loss": 0.5322, + "step": 5966, + "task_loss": 1.0734010934829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3252885639667511, + "epoch": 5.04, + "learning_rate": 1.3971742543171114e-05, + "loss": 0.449, + "step": 5967, + "task_loss": 0.7452465891838074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6205657720565796, + "epoch": 5.04, + "learning_rate": 1.3965704625045283e-05, + "loss": 0.5014, + "step": 5968, + "task_loss": 0.6555680632591248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7925713062286377, + "epoch": 5.05, + "learning_rate": 1.3959666706919456e-05, + "loss": 0.6667, + "step": 5969, + "task_loss": 2.157383680343628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3912781774997711, + "epoch": 5.05, + "learning_rate": 1.3953628788793624e-05, + "loss": 0.6124, + "step": 5970, + "task_loss": 0.21668975055217743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.599994421005249, + "epoch": 5.05, + "learning_rate": 1.3947590870667795e-05, + "loss": 0.4523, + "step": 5971, + "task_loss": 1.3879796266555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40146079659461975, + "epoch": 5.05, + "learning_rate": 1.3941552952541964e-05, + "loss": 0.6216, + "step": 5972, + "task_loss": 0.8643374443054199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2691670060157776, + "epoch": 5.05, + "learning_rate": 1.3935515034416133e-05, + "loss": 0.3521, + "step": 5973, + "task_loss": 0.520159125328064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4834488332271576, + "epoch": 5.05, + "learning_rate": 1.3929477116290305e-05, + "loss": 0.5006, + "step": 5974, + "task_loss": 0.6700630187988281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4914935827255249, + "epoch": 5.05, + "learning_rate": 1.3923439198164472e-05, + "loss": 0.3669, + "step": 5975, + "task_loss": 0.5475313663482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6089412569999695, + "epoch": 5.05, + "learning_rate": 1.3917401280038645e-05, + "loss": 0.5273, + "step": 5976, + "task_loss": 1.2933344841003418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5726367235183716, + "epoch": 5.05, + "learning_rate": 1.3911363361912813e-05, + "loss": 0.5114, + "step": 5977, + "task_loss": 0.4062042534351349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.372598797082901, + "epoch": 5.05, + "learning_rate": 1.3905325443786982e-05, + "loss": 0.4542, + "step": 5978, + "task_loss": 0.28348684310913086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46051955223083496, + "epoch": 5.05, + "learning_rate": 1.3899287525661153e-05, + "loss": 0.3955, + "step": 5979, + "task_loss": 0.285810649394989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2923417091369629, + "epoch": 5.05, + "learning_rate": 1.3893249607535322e-05, + "loss": 0.4758, + "step": 5980, + "task_loss": 0.1647213101387024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3080926537513733, + "epoch": 5.06, + "learning_rate": 1.3887211689409494e-05, + "loss": 0.465, + "step": 5981, + "task_loss": 1.0512843132019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49598783254623413, + "epoch": 5.06, + "learning_rate": 1.3881173771283663e-05, + "loss": 0.5469, + "step": 5982, + "task_loss": 1.1917130947113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5528882145881653, + "epoch": 5.06, + "learning_rate": 1.387513585315783e-05, + "loss": 0.4241, + "step": 5983, + "task_loss": 0.2213740348815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8582708835601807, + "epoch": 5.06, + "learning_rate": 1.3869097935032002e-05, + "loss": 0.6538, + "step": 5984, + "task_loss": 1.4047415256500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5671178102493286, + "epoch": 5.06, + "learning_rate": 1.3863060016906171e-05, + "loss": 0.4732, + "step": 5985, + "task_loss": 1.1533161401748657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4082861840724945, + "epoch": 5.06, + "learning_rate": 1.3857022098780342e-05, + "loss": 0.4781, + "step": 5986, + "task_loss": 0.7760971188545227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6113453507423401, + "epoch": 5.06, + "learning_rate": 1.385098418065451e-05, + "loss": 0.5021, + "step": 5987, + "task_loss": 2.1331605911254883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40066906809806824, + "epoch": 5.06, + "learning_rate": 1.384494626252868e-05, + "loss": 0.5455, + "step": 5988, + "task_loss": 0.3311357796192169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34928593039512634, + "epoch": 5.06, + "learning_rate": 1.3838908344402852e-05, + "loss": 0.384, + "step": 5989, + "task_loss": 1.1157867908477783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44127339124679565, + "epoch": 5.06, + "learning_rate": 1.3832870426277019e-05, + "loss": 0.4786, + "step": 5990, + "task_loss": 0.5373550653457642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3964579701423645, + "epoch": 5.06, + "learning_rate": 1.3826832508151191e-05, + "loss": 0.5821, + "step": 5991, + "task_loss": 1.3341269493103027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23927077651023865, + "epoch": 5.07, + "learning_rate": 1.382079459002536e-05, + "loss": 0.3745, + "step": 5992, + "task_loss": 0.14874033629894257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4649468958377838, + "epoch": 5.07, + "learning_rate": 1.3814756671899529e-05, + "loss": 0.5224, + "step": 5993, + "task_loss": 1.2135146856307983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8507091403007507, + "epoch": 5.07, + "learning_rate": 1.38087187537737e-05, + "loss": 0.4885, + "step": 5994, + "task_loss": 1.1142359972000122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41716885566711426, + "epoch": 5.07, + "learning_rate": 1.3802680835647868e-05, + "loss": 0.5136, + "step": 5995, + "task_loss": 0.8956722021102905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3436892032623291, + "epoch": 5.07, + "learning_rate": 1.379664291752204e-05, + "loss": 0.5156, + "step": 5996, + "task_loss": 0.9041117429733276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29305529594421387, + "epoch": 5.07, + "learning_rate": 1.379060499939621e-05, + "loss": 0.3566, + "step": 5997, + "task_loss": 0.10946956276893616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19880545139312744, + "epoch": 5.07, + "learning_rate": 1.3784567081270377e-05, + "loss": 0.3587, + "step": 5998, + "task_loss": 0.05080127343535423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3904602527618408, + "epoch": 5.07, + "learning_rate": 1.3778529163144549e-05, + "loss": 0.3601, + "step": 5999, + "task_loss": 0.6094356775283813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6640608310699463, + "epoch": 5.07, + "learning_rate": 1.3772491245018718e-05, + "loss": 0.5787, + "step": 6000, + "task_loss": 0.18646833300590515 + }, + { + "epoch": 5.07, + "eval_accuracy": 0.9078415841584159, + "eval_loss": 0.30195334553718567, + "eval_runtime": 226.4735, + "eval_samples_per_second": 111.492, + "eval_steps_per_second": 0.874, + "step": 6000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7259561419487, + "epoch": 5.07, + "learning_rate": 1.3766453326892888e-05, + "loss": 0.4455, + "step": 6001, + "task_loss": 0.8431296944618225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3298564553260803, + "epoch": 5.07, + "learning_rate": 1.3760415408767057e-05, + "loss": 0.5435, + "step": 6002, + "task_loss": 0.996370255947113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3912476599216461, + "epoch": 5.07, + "learning_rate": 1.3754377490641226e-05, + "loss": 0.4072, + "step": 6003, + "task_loss": 0.06636767089366913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47756001353263855, + "epoch": 5.08, + "learning_rate": 1.3748339572515398e-05, + "loss": 0.4109, + "step": 6004, + "task_loss": 0.6571682691574097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4718065559864044, + "epoch": 5.08, + "learning_rate": 1.3742301654389567e-05, + "loss": 0.392, + "step": 6005, + "task_loss": 1.1199108362197876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40528351068496704, + "epoch": 5.08, + "learning_rate": 1.3736263736263738e-05, + "loss": 0.3548, + "step": 6006, + "task_loss": 0.42825576663017273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3780808448791504, + "epoch": 5.08, + "learning_rate": 1.3730225818137907e-05, + "loss": 0.3659, + "step": 6007, + "task_loss": 1.0530563592910767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39639055728912354, + "epoch": 5.08, + "learning_rate": 1.3724187900012076e-05, + "loss": 0.4787, + "step": 6008, + "task_loss": 0.4536390006542206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4206335246562958, + "epoch": 5.08, + "learning_rate": 1.3718149981886246e-05, + "loss": 0.3619, + "step": 6009, + "task_loss": 1.1850595474243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29675641655921936, + "epoch": 5.08, + "learning_rate": 1.3712112063760415e-05, + "loss": 0.5426, + "step": 6010, + "task_loss": 0.16049526631832123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5407566428184509, + "epoch": 5.08, + "learning_rate": 1.3706074145634587e-05, + "loss": 0.4744, + "step": 6011, + "task_loss": 0.933721125125885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.416929692029953, + "epoch": 5.08, + "learning_rate": 1.3700036227508756e-05, + "loss": 0.5826, + "step": 6012, + "task_loss": 0.26832908391952515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5752192735671997, + "epoch": 5.08, + "learning_rate": 1.3693998309382925e-05, + "loss": 0.432, + "step": 6013, + "task_loss": 1.1222081184387207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3737102150917053, + "epoch": 5.08, + "learning_rate": 1.3687960391257096e-05, + "loss": 0.39, + "step": 6014, + "task_loss": 0.08444869518280029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29432401061058044, + "epoch": 5.08, + "learning_rate": 1.3681922473131265e-05, + "loss": 0.3323, + "step": 6015, + "task_loss": 0.2860241234302521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6316185593605042, + "epoch": 5.09, + "learning_rate": 1.3675884555005435e-05, + "loss": 0.3914, + "step": 6016, + "task_loss": 0.6852641105651855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5086910724639893, + "epoch": 5.09, + "learning_rate": 1.3669846636879604e-05, + "loss": 0.5866, + "step": 6017, + "task_loss": 0.3768693804740906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4156798720359802, + "epoch": 5.09, + "learning_rate": 1.3663808718753773e-05, + "loss": 0.4659, + "step": 6018, + "task_loss": 0.809343159198761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46982911229133606, + "epoch": 5.09, + "learning_rate": 1.3657770800627945e-05, + "loss": 0.5185, + "step": 6019, + "task_loss": 0.10722782462835312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4374825954437256, + "epoch": 5.09, + "learning_rate": 1.3651732882502114e-05, + "loss": 0.4828, + "step": 6020, + "task_loss": 0.6165213584899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.727118730545044, + "epoch": 5.09, + "learning_rate": 1.3645694964376285e-05, + "loss": 0.6018, + "step": 6021, + "task_loss": 1.128334641456604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30963483452796936, + "epoch": 5.09, + "learning_rate": 1.3639657046250454e-05, + "loss": 0.4595, + "step": 6022, + "task_loss": 0.8902470469474792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5258951187133789, + "epoch": 5.09, + "learning_rate": 1.3633619128124622e-05, + "loss": 0.5562, + "step": 6023, + "task_loss": 0.41981035470962524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7069607973098755, + "epoch": 5.09, + "learning_rate": 1.3627581209998793e-05, + "loss": 0.5191, + "step": 6024, + "task_loss": 0.2582801580429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6058889627456665, + "epoch": 5.09, + "learning_rate": 1.3621543291872962e-05, + "loss": 0.465, + "step": 6025, + "task_loss": 0.5562323927879333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39247676730155945, + "epoch": 5.09, + "learning_rate": 1.3615505373747134e-05, + "loss": 0.3144, + "step": 6026, + "task_loss": 0.25659263134002686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4586787223815918, + "epoch": 5.09, + "learning_rate": 1.3609467455621303e-05, + "loss": 0.5015, + "step": 6027, + "task_loss": 0.20159313082695007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4671986699104309, + "epoch": 5.1, + "learning_rate": 1.3603429537495472e-05, + "loss": 0.4835, + "step": 6028, + "task_loss": 1.1619223356246948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26620790362358093, + "epoch": 5.1, + "learning_rate": 1.3597391619369642e-05, + "loss": 0.598, + "step": 6029, + "task_loss": 0.405183345079422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5016902685165405, + "epoch": 5.1, + "learning_rate": 1.3591353701243811e-05, + "loss": 0.4632, + "step": 6030, + "task_loss": 1.0069116353988647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2515528202056885, + "epoch": 5.1, + "learning_rate": 1.3585315783117984e-05, + "loss": 0.3232, + "step": 6031, + "task_loss": 0.22121688723564148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43135589361190796, + "epoch": 5.1, + "learning_rate": 1.357927786499215e-05, + "loss": 0.4431, + "step": 6032, + "task_loss": 0.4904557764530182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5272772908210754, + "epoch": 5.1, + "learning_rate": 1.357323994686632e-05, + "loss": 0.5532, + "step": 6033, + "task_loss": 1.227022409439087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3377170264720917, + "epoch": 5.1, + "learning_rate": 1.3567202028740492e-05, + "loss": 0.3893, + "step": 6034, + "task_loss": 0.10974021255970001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5595309734344482, + "epoch": 5.1, + "learning_rate": 1.356116411061466e-05, + "loss": 0.4964, + "step": 6035, + "task_loss": 0.7340883612632751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48726242780685425, + "epoch": 5.1, + "learning_rate": 1.3555126192488831e-05, + "loss": 0.5724, + "step": 6036, + "task_loss": 0.26538988947868347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6428620219230652, + "epoch": 5.1, + "learning_rate": 1.3549088274363e-05, + "loss": 0.4895, + "step": 6037, + "task_loss": 0.5712012648582458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40082937479019165, + "epoch": 5.1, + "learning_rate": 1.3543050356237169e-05, + "loss": 0.5232, + "step": 6038, + "task_loss": 0.58014976978302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28909146785736084, + "epoch": 5.1, + "learning_rate": 1.3537012438111341e-05, + "loss": 0.4127, + "step": 6039, + "task_loss": 0.4486733078956604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31982994079589844, + "epoch": 5.11, + "learning_rate": 1.3530974519985509e-05, + "loss": 0.331, + "step": 6040, + "task_loss": 0.4047614336013794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3360825479030609, + "epoch": 5.11, + "learning_rate": 1.352493660185968e-05, + "loss": 0.5412, + "step": 6041, + "task_loss": 0.18505176901817322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7931953072547913, + "epoch": 5.11, + "learning_rate": 1.351889868373385e-05, + "loss": 0.5883, + "step": 6042, + "task_loss": 0.7573176622390747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45055651664733887, + "epoch": 5.11, + "learning_rate": 1.3512860765608019e-05, + "loss": 0.4752, + "step": 6043, + "task_loss": 0.8979575634002686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.263220876455307, + "epoch": 5.11, + "learning_rate": 1.350682284748219e-05, + "loss": 0.4862, + "step": 6044, + "task_loss": 1.0195878744125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3075276017189026, + "epoch": 5.11, + "learning_rate": 1.3500784929356358e-05, + "loss": 0.5362, + "step": 6045, + "task_loss": 0.7288013100624084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.361200749874115, + "epoch": 5.11, + "learning_rate": 1.349474701123053e-05, + "loss": 0.3927, + "step": 6046, + "task_loss": 1.2523176670074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.666648268699646, + "epoch": 5.11, + "learning_rate": 1.3488709093104697e-05, + "loss": 0.4789, + "step": 6047, + "task_loss": 0.1753956377506256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2796754240989685, + "epoch": 5.11, + "learning_rate": 1.3482671174978866e-05, + "loss": 0.3953, + "step": 6048, + "task_loss": 0.13282561302185059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46063441038131714, + "epoch": 5.11, + "learning_rate": 1.3476633256853039e-05, + "loss": 0.4075, + "step": 6049, + "task_loss": 0.5712062120437622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4028043746948242, + "epoch": 5.11, + "learning_rate": 1.3470595338727207e-05, + "loss": 0.4733, + "step": 6050, + "task_loss": 1.1346734762191772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4233834743499756, + "epoch": 5.11, + "learning_rate": 1.3464557420601378e-05, + "loss": 0.4924, + "step": 6051, + "task_loss": 1.0581886768341064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28586089611053467, + "epoch": 5.12, + "learning_rate": 1.3458519502475547e-05, + "loss": 0.4679, + "step": 6052, + "task_loss": 0.4870016574859619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3542870283126831, + "epoch": 5.12, + "learning_rate": 1.3452481584349716e-05, + "loss": 0.4017, + "step": 6053, + "task_loss": 0.8707901239395142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4612223207950592, + "epoch": 5.12, + "learning_rate": 1.3446443666223888e-05, + "loss": 0.4379, + "step": 6054, + "task_loss": 0.9345571994781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4924331307411194, + "epoch": 5.12, + "learning_rate": 1.3440405748098055e-05, + "loss": 0.4681, + "step": 6055, + "task_loss": 1.4472228288650513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40319788455963135, + "epoch": 5.12, + "learning_rate": 1.3434367829972228e-05, + "loss": 0.5131, + "step": 6056, + "task_loss": 0.36424267292022705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47285884618759155, + "epoch": 5.12, + "learning_rate": 1.3428329911846396e-05, + "loss": 0.352, + "step": 6057, + "task_loss": 0.559246838092804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4122989773750305, + "epoch": 5.12, + "learning_rate": 1.3422291993720565e-05, + "loss": 0.5335, + "step": 6058, + "task_loss": 0.477403461933136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42810606956481934, + "epoch": 5.12, + "learning_rate": 1.3416254075594736e-05, + "loss": 0.4809, + "step": 6059, + "task_loss": 0.62047278881073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30597326159477234, + "epoch": 5.12, + "learning_rate": 1.3410216157468905e-05, + "loss": 0.3824, + "step": 6060, + "task_loss": 0.16076746582984924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6116683483123779, + "epoch": 5.12, + "learning_rate": 1.3404178239343077e-05, + "loss": 0.6654, + "step": 6061, + "task_loss": 2.1852352619171143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2726067900657654, + "epoch": 5.12, + "learning_rate": 1.3398140321217246e-05, + "loss": 0.3771, + "step": 6062, + "task_loss": 0.23471403121948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3169618248939514, + "epoch": 5.13, + "learning_rate": 1.3392102403091413e-05, + "loss": 0.3747, + "step": 6063, + "task_loss": 0.10524037480354309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7403062582015991, + "epoch": 5.13, + "learning_rate": 1.3386064484965585e-05, + "loss": 0.4934, + "step": 6064, + "task_loss": 0.8136484622955322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5489674806594849, + "epoch": 5.13, + "learning_rate": 1.3380026566839754e-05, + "loss": 0.4284, + "step": 6065, + "task_loss": 0.5061486959457397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38511645793914795, + "epoch": 5.13, + "learning_rate": 1.3373988648713925e-05, + "loss": 0.4308, + "step": 6066, + "task_loss": 0.4862314462661743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3233916759490967, + "epoch": 5.13, + "learning_rate": 1.3367950730588094e-05, + "loss": 0.5189, + "step": 6067, + "task_loss": 0.5849013328552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4537801742553711, + "epoch": 5.13, + "learning_rate": 1.3361912812462263e-05, + "loss": 0.6275, + "step": 6068, + "task_loss": 0.7423861026763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1965762972831726, + "epoch": 5.13, + "learning_rate": 1.3355874894336435e-05, + "loss": 0.4594, + "step": 6069, + "task_loss": 0.18746179342269897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40123844146728516, + "epoch": 5.13, + "learning_rate": 1.3349836976210604e-05, + "loss": 0.5237, + "step": 6070, + "task_loss": 0.7818856835365295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6137521266937256, + "epoch": 5.13, + "learning_rate": 1.3343799058084774e-05, + "loss": 0.5028, + "step": 6071, + "task_loss": 1.7472507953643799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5355037450790405, + "epoch": 5.13, + "learning_rate": 1.3337761139958943e-05, + "loss": 0.5622, + "step": 6072, + "task_loss": 1.208091378211975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32960784435272217, + "epoch": 5.13, + "learning_rate": 1.3331723221833112e-05, + "loss": 0.4141, + "step": 6073, + "task_loss": 0.3296586275100708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.446250855922699, + "epoch": 5.13, + "learning_rate": 1.3325685303707283e-05, + "loss": 0.5339, + "step": 6074, + "task_loss": 0.9995641708374023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6211857199668884, + "epoch": 5.14, + "learning_rate": 1.3319647385581451e-05, + "loss": 0.5358, + "step": 6075, + "task_loss": 0.7238313555717468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24836868047714233, + "epoch": 5.14, + "learning_rate": 1.3313609467455624e-05, + "loss": 0.4539, + "step": 6076, + "task_loss": 0.028985394164919853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7105672359466553, + "epoch": 5.14, + "learning_rate": 1.3307571549329793e-05, + "loss": 0.5364, + "step": 6077, + "task_loss": 1.4642765522003174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29775524139404297, + "epoch": 5.14, + "learning_rate": 1.3301533631203961e-05, + "loss": 0.4736, + "step": 6078, + "task_loss": 1.415164589881897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42882752418518066, + "epoch": 5.14, + "learning_rate": 1.3295495713078132e-05, + "loss": 0.4568, + "step": 6079, + "task_loss": 0.4393989145755768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28087925910949707, + "epoch": 5.14, + "learning_rate": 1.3289457794952301e-05, + "loss": 0.4659, + "step": 6080, + "task_loss": 0.3817955553531647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3529781103134155, + "epoch": 5.14, + "learning_rate": 1.328341987682647e-05, + "loss": 0.4628, + "step": 6081, + "task_loss": 0.14105166494846344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2521512508392334, + "epoch": 5.14, + "learning_rate": 1.327738195870064e-05, + "loss": 0.4198, + "step": 6082, + "task_loss": 0.8012117743492126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26989591121673584, + "epoch": 5.14, + "learning_rate": 1.327134404057481e-05, + "loss": 0.5553, + "step": 6083, + "task_loss": 0.8267058730125427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3994668424129486, + "epoch": 5.14, + "learning_rate": 1.3265306122448982e-05, + "loss": 0.4681, + "step": 6084, + "task_loss": 0.7029871940612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2228318452835083, + "epoch": 5.14, + "learning_rate": 1.325926820432315e-05, + "loss": 0.56, + "step": 6085, + "task_loss": 0.6692995429039001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47748300433158875, + "epoch": 5.14, + "learning_rate": 1.325323028619732e-05, + "loss": 0.5437, + "step": 6086, + "task_loss": 1.038307547569275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7062476277351379, + "epoch": 5.15, + "learning_rate": 1.324719236807149e-05, + "loss": 0.5343, + "step": 6087, + "task_loss": 0.4333077371120453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.517061173915863, + "epoch": 5.15, + "learning_rate": 1.3241154449945659e-05, + "loss": 0.5794, + "step": 6088, + "task_loss": 0.7897264957427979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48791441321372986, + "epoch": 5.15, + "learning_rate": 1.323511653181983e-05, + "loss": 0.4022, + "step": 6089, + "task_loss": 0.5231915712356567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2645205557346344, + "epoch": 5.15, + "learning_rate": 1.3229078613693998e-05, + "loss": 0.4335, + "step": 6090, + "task_loss": 0.1047622561454773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40866953134536743, + "epoch": 5.15, + "learning_rate": 1.3223040695568167e-05, + "loss": 0.424, + "step": 6091, + "task_loss": 0.47545912861824036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.516728401184082, + "epoch": 5.15, + "learning_rate": 1.321700277744234e-05, + "loss": 0.452, + "step": 6092, + "task_loss": 0.6408304572105408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4165635108947754, + "epoch": 5.15, + "learning_rate": 1.3210964859316508e-05, + "loss": 0.4679, + "step": 6093, + "task_loss": 0.5097827911376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34020093083381653, + "epoch": 5.15, + "learning_rate": 1.3204926941190679e-05, + "loss": 0.4782, + "step": 6094, + "task_loss": 0.34038764238357544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35751307010650635, + "epoch": 5.15, + "learning_rate": 1.3198889023064848e-05, + "loss": 0.4585, + "step": 6095, + "task_loss": 0.8555724024772644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.754215657711029, + "epoch": 5.15, + "learning_rate": 1.3192851104939017e-05, + "loss": 0.6339, + "step": 6096, + "task_loss": 1.6852246522903442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3186350464820862, + "epoch": 5.15, + "learning_rate": 1.3186813186813187e-05, + "loss": 0.3725, + "step": 6097, + "task_loss": 1.0746930837631226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5849156975746155, + "epoch": 5.15, + "learning_rate": 1.3180775268687356e-05, + "loss": 0.6488, + "step": 6098, + "task_loss": 1.3260470628738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42752474546432495, + "epoch": 5.16, + "learning_rate": 1.3174737350561528e-05, + "loss": 0.5524, + "step": 6099, + "task_loss": 0.8044757843017578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2714477777481079, + "epoch": 5.16, + "learning_rate": 1.3168699432435697e-05, + "loss": 0.467, + "step": 6100, + "task_loss": 0.45615917444229126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5291986465454102, + "epoch": 5.16, + "learning_rate": 1.3162661514309866e-05, + "loss": 0.6256, + "step": 6101, + "task_loss": 2.1692802906036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5348755717277527, + "epoch": 5.16, + "learning_rate": 1.3156623596184037e-05, + "loss": 0.5042, + "step": 6102, + "task_loss": 0.2975325286388397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8818687200546265, + "epoch": 5.16, + "learning_rate": 1.3150585678058205e-05, + "loss": 0.5393, + "step": 6103, + "task_loss": 1.2022050619125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.442829966545105, + "epoch": 5.16, + "learning_rate": 1.3144547759932378e-05, + "loss": 0.3998, + "step": 6104, + "task_loss": 0.4400947093963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.460046648979187, + "epoch": 5.16, + "learning_rate": 1.3138509841806545e-05, + "loss": 0.3714, + "step": 6105, + "task_loss": 0.2084684669971466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5473044514656067, + "epoch": 5.16, + "learning_rate": 1.3132471923680714e-05, + "loss": 0.4591, + "step": 6106, + "task_loss": 0.363830029964447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.374272882938385, + "epoch": 5.16, + "learning_rate": 1.3126434005554886e-05, + "loss": 0.4842, + "step": 6107, + "task_loss": 0.4404298663139343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29356062412261963, + "epoch": 5.16, + "learning_rate": 1.3120396087429055e-05, + "loss": 0.5056, + "step": 6108, + "task_loss": 0.3627863824367523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4361107349395752, + "epoch": 5.16, + "learning_rate": 1.3114358169303225e-05, + "loss": 0.5519, + "step": 6109, + "task_loss": 1.5569052696228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4370635747909546, + "epoch": 5.16, + "learning_rate": 1.3108320251177394e-05, + "loss": 0.3546, + "step": 6110, + "task_loss": 1.0480105876922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4578360915184021, + "epoch": 5.17, + "learning_rate": 1.3102282333051563e-05, + "loss": 0.4592, + "step": 6111, + "task_loss": 0.5150473713874817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5022355318069458, + "epoch": 5.17, + "learning_rate": 1.3096244414925734e-05, + "loss": 0.5014, + "step": 6112, + "task_loss": 0.7386837601661682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35836079716682434, + "epoch": 5.17, + "learning_rate": 1.3090206496799903e-05, + "loss": 0.5635, + "step": 6113, + "task_loss": 0.46500322222709656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45011839270591736, + "epoch": 5.17, + "learning_rate": 1.3084168578674075e-05, + "loss": 0.5577, + "step": 6114, + "task_loss": 0.7670868039131165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39528197050094604, + "epoch": 5.17, + "learning_rate": 1.3078130660548244e-05, + "loss": 0.4614, + "step": 6115, + "task_loss": 0.14794820547103882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5256476402282715, + "epoch": 5.17, + "learning_rate": 1.3072092742422413e-05, + "loss": 0.4468, + "step": 6116, + "task_loss": 0.8338684439659119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31768783926963806, + "epoch": 5.17, + "learning_rate": 1.3066054824296583e-05, + "loss": 0.4017, + "step": 6117, + "task_loss": 0.35649555921554565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4154629111289978, + "epoch": 5.17, + "learning_rate": 1.3060016906170752e-05, + "loss": 0.4602, + "step": 6118, + "task_loss": 0.9593022465705872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.100689172744751, + "epoch": 5.17, + "learning_rate": 1.3053978988044924e-05, + "loss": 0.6215, + "step": 6119, + "task_loss": 0.8713032603263855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25431886315345764, + "epoch": 5.17, + "learning_rate": 1.3047941069919092e-05, + "loss": 0.382, + "step": 6120, + "task_loss": 0.19224536418914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29706811904907227, + "epoch": 5.17, + "learning_rate": 1.304190315179326e-05, + "loss": 0.4793, + "step": 6121, + "task_loss": 1.097525715827942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4011467397212982, + "epoch": 5.17, + "learning_rate": 1.3035865233667433e-05, + "loss": 0.5848, + "step": 6122, + "task_loss": 0.6773102283477783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6201338768005371, + "epoch": 5.18, + "learning_rate": 1.3029827315541602e-05, + "loss": 0.4676, + "step": 6123, + "task_loss": 0.5619208812713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.574016809463501, + "epoch": 5.18, + "learning_rate": 1.3023789397415772e-05, + "loss": 0.4973, + "step": 6124, + "task_loss": 0.7853434681892395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49159732460975647, + "epoch": 5.18, + "learning_rate": 1.3017751479289941e-05, + "loss": 0.4246, + "step": 6125, + "task_loss": 0.8851189613342285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7109062671661377, + "epoch": 5.18, + "learning_rate": 1.301171356116411e-05, + "loss": 0.5848, + "step": 6126, + "task_loss": 0.7335724830627441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33137527108192444, + "epoch": 5.18, + "learning_rate": 1.3005675643038282e-05, + "loss": 0.3636, + "step": 6127, + "task_loss": 0.07704176008701324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7831236124038696, + "epoch": 5.18, + "learning_rate": 1.299963772491245e-05, + "loss": 0.4737, + "step": 6128, + "task_loss": 0.3302163779735565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5318394303321838, + "epoch": 5.18, + "learning_rate": 1.2993599806786622e-05, + "loss": 0.4525, + "step": 6129, + "task_loss": 0.9210467338562012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3585624098777771, + "epoch": 5.18, + "learning_rate": 1.298756188866079e-05, + "loss": 0.4353, + "step": 6130, + "task_loss": 0.7049155831336975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37324902415275574, + "epoch": 5.18, + "learning_rate": 1.298152397053496e-05, + "loss": 0.4852, + "step": 6131, + "task_loss": 1.2507636547088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49834734201431274, + "epoch": 5.18, + "learning_rate": 1.297548605240913e-05, + "loss": 0.4704, + "step": 6132, + "task_loss": 0.9661620855331421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3737214207649231, + "epoch": 5.18, + "learning_rate": 1.2969448134283299e-05, + "loss": 0.4332, + "step": 6133, + "task_loss": 0.39389070868492126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6448582410812378, + "epoch": 5.19, + "learning_rate": 1.2963410216157471e-05, + "loss": 0.5645, + "step": 6134, + "task_loss": 0.9474178552627563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5322898626327515, + "epoch": 5.19, + "learning_rate": 1.295737229803164e-05, + "loss": 0.4236, + "step": 6135, + "task_loss": 0.6148312091827393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.378815621137619, + "epoch": 5.19, + "learning_rate": 1.2951334379905807e-05, + "loss": 0.395, + "step": 6136, + "task_loss": 0.9980775713920593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6200602650642395, + "epoch": 5.19, + "learning_rate": 1.294529646177998e-05, + "loss": 0.4538, + "step": 6137, + "task_loss": 1.2150746583938599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38590967655181885, + "epoch": 5.19, + "learning_rate": 1.2939258543654148e-05, + "loss": 0.442, + "step": 6138, + "task_loss": 0.639130175113678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3706420063972473, + "epoch": 5.19, + "learning_rate": 1.2933220625528319e-05, + "loss": 0.3614, + "step": 6139, + "task_loss": 0.08359649777412415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47983962297439575, + "epoch": 5.19, + "learning_rate": 1.2927182707402488e-05, + "loss": 0.3774, + "step": 6140, + "task_loss": 0.1693434864282608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5580073595046997, + "epoch": 5.19, + "learning_rate": 1.2921144789276657e-05, + "loss": 0.4991, + "step": 6141, + "task_loss": 0.9490946531295776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36639636754989624, + "epoch": 5.19, + "learning_rate": 1.2915106871150829e-05, + "loss": 0.4909, + "step": 6142, + "task_loss": 0.3023679256439209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3850104510784149, + "epoch": 5.19, + "learning_rate": 1.2909068953024998e-05, + "loss": 0.5837, + "step": 6143, + "task_loss": 0.10401973128318787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4688534736633301, + "epoch": 5.19, + "learning_rate": 1.2903031034899168e-05, + "loss": 0.4782, + "step": 6144, + "task_loss": 0.6819477677345276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4195156693458557, + "epoch": 5.19, + "learning_rate": 1.2896993116773337e-05, + "loss": 0.5362, + "step": 6145, + "task_loss": 0.8169540762901306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24895665049552917, + "epoch": 5.2, + "learning_rate": 1.2890955198647506e-05, + "loss": 0.3616, + "step": 6146, + "task_loss": 0.8087191581726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37978583574295044, + "epoch": 5.2, + "learning_rate": 1.2884917280521677e-05, + "loss": 0.464, + "step": 6147, + "task_loss": 0.2558240294456482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2798442840576172, + "epoch": 5.2, + "learning_rate": 1.2878879362395846e-05, + "loss": 0.3981, + "step": 6148, + "task_loss": 0.7067782878875732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4613914489746094, + "epoch": 5.2, + "learning_rate": 1.2872841444270018e-05, + "loss": 0.3899, + "step": 6149, + "task_loss": 0.5899757146835327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2541161775588989, + "epoch": 5.2, + "learning_rate": 1.2866803526144187e-05, + "loss": 0.4244, + "step": 6150, + "task_loss": 0.5561696290969849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2999497652053833, + "epoch": 5.2, + "learning_rate": 1.2860765608018356e-05, + "loss": 0.3665, + "step": 6151, + "task_loss": 0.3899548649787903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2719145715236664, + "epoch": 5.2, + "learning_rate": 1.2854727689892526e-05, + "loss": 0.4217, + "step": 6152, + "task_loss": 0.22736620903015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4711979925632477, + "epoch": 5.2, + "learning_rate": 1.2848689771766695e-05, + "loss": 0.4927, + "step": 6153, + "task_loss": 0.9362608790397644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2930052876472473, + "epoch": 5.2, + "learning_rate": 1.2842651853640866e-05, + "loss": 0.547, + "step": 6154, + "task_loss": 0.4726117253303528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6689135432243347, + "epoch": 5.2, + "learning_rate": 1.2836613935515034e-05, + "loss": 0.4096, + "step": 6155, + "task_loss": 0.9032557010650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.424170583486557, + "epoch": 5.2, + "learning_rate": 1.2830576017389203e-05, + "loss": 0.5204, + "step": 6156, + "task_loss": 1.4358208179473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.633333146572113, + "epoch": 5.2, + "learning_rate": 1.2824538099263376e-05, + "loss": 0.5292, + "step": 6157, + "task_loss": 1.0741466283798218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37188035249710083, + "epoch": 5.21, + "learning_rate": 1.2818500181137544e-05, + "loss": 0.3608, + "step": 6158, + "task_loss": 0.919540286064148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40353313088417053, + "epoch": 5.21, + "learning_rate": 1.2812462263011715e-05, + "loss": 0.4106, + "step": 6159, + "task_loss": 0.6638248562812805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32889360189437866, + "epoch": 5.21, + "learning_rate": 1.2806424344885884e-05, + "loss": 0.3727, + "step": 6160, + "task_loss": 0.14907263219356537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44253841042518616, + "epoch": 5.21, + "learning_rate": 1.2800386426760053e-05, + "loss": 0.5595, + "step": 6161, + "task_loss": 0.4107782542705536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5456867814064026, + "epoch": 5.21, + "learning_rate": 1.2794348508634223e-05, + "loss": 0.5208, + "step": 6162, + "task_loss": 1.5941705703735352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.493611216545105, + "epoch": 5.21, + "learning_rate": 1.2788310590508392e-05, + "loss": 0.5042, + "step": 6163, + "task_loss": 1.0247541666030884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3051633834838867, + "epoch": 5.21, + "learning_rate": 1.2782272672382565e-05, + "loss": 0.3439, + "step": 6164, + "task_loss": 0.2997063398361206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2845070958137512, + "epoch": 5.21, + "learning_rate": 1.2776234754256733e-05, + "loss": 0.4036, + "step": 6165, + "task_loss": 0.674467146396637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38586676120758057, + "epoch": 5.21, + "learning_rate": 1.2770196836130902e-05, + "loss": 0.3496, + "step": 6166, + "task_loss": 0.29818442463874817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45291945338249207, + "epoch": 5.21, + "learning_rate": 1.2764158918005073e-05, + "loss": 0.6866, + "step": 6167, + "task_loss": 0.47581860423088074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5666059851646423, + "epoch": 5.21, + "learning_rate": 1.2758120999879242e-05, + "loss": 0.5422, + "step": 6168, + "task_loss": 0.47892069816589355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2942771911621094, + "epoch": 5.21, + "learning_rate": 1.2752083081753414e-05, + "loss": 0.4333, + "step": 6169, + "task_loss": 0.787386417388916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40701723098754883, + "epoch": 5.22, + "learning_rate": 1.2746045163627581e-05, + "loss": 0.5495, + "step": 6170, + "task_loss": 0.9545518159866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4142760932445526, + "epoch": 5.22, + "learning_rate": 1.274000724550175e-05, + "loss": 0.4105, + "step": 6171, + "task_loss": 1.2535786628723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4161740243434906, + "epoch": 5.22, + "learning_rate": 1.2733969327375922e-05, + "loss": 0.4184, + "step": 6172, + "task_loss": 0.6351439952850342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4108370542526245, + "epoch": 5.22, + "learning_rate": 1.2727931409250091e-05, + "loss": 0.5608, + "step": 6173, + "task_loss": 0.3379417955875397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3723337650299072, + "epoch": 5.22, + "learning_rate": 1.2721893491124262e-05, + "loss": 0.3661, + "step": 6174, + "task_loss": 0.4654655456542969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38027501106262207, + "epoch": 5.22, + "learning_rate": 1.271585557299843e-05, + "loss": 0.5391, + "step": 6175, + "task_loss": 0.8787911534309387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33820420503616333, + "epoch": 5.22, + "learning_rate": 1.27098176548726e-05, + "loss": 0.53, + "step": 6176, + "task_loss": 1.0738377571105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2917509377002716, + "epoch": 5.22, + "learning_rate": 1.270377973674677e-05, + "loss": 0.4206, + "step": 6177, + "task_loss": 0.7680242657661438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5530897378921509, + "epoch": 5.22, + "learning_rate": 1.2697741818620939e-05, + "loss": 0.421, + "step": 6178, + "task_loss": 1.0556491613388062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5422264337539673, + "epoch": 5.22, + "learning_rate": 1.2691703900495111e-05, + "loss": 0.566, + "step": 6179, + "task_loss": 0.9877161383628845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39203545451164246, + "epoch": 5.22, + "learning_rate": 1.268566598236928e-05, + "loss": 0.3573, + "step": 6180, + "task_loss": 0.2709059715270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3794826567173004, + "epoch": 5.22, + "learning_rate": 1.2679628064243449e-05, + "loss": 0.4887, + "step": 6181, + "task_loss": 0.9890578985214233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4647165536880493, + "epoch": 5.23, + "learning_rate": 1.267359014611762e-05, + "loss": 0.4008, + "step": 6182, + "task_loss": 0.3363933563232422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5746468305587769, + "epoch": 5.23, + "learning_rate": 1.2667552227991788e-05, + "loss": 0.5554, + "step": 6183, + "task_loss": 1.390650987625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.502027690410614, + "epoch": 5.23, + "learning_rate": 1.266151430986596e-05, + "loss": 0.5143, + "step": 6184, + "task_loss": 0.7353254556655884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3463137447834015, + "epoch": 5.23, + "learning_rate": 1.2655476391740128e-05, + "loss": 0.4681, + "step": 6185, + "task_loss": 1.13534414768219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44857269525527954, + "epoch": 5.23, + "learning_rate": 1.2649438473614297e-05, + "loss": 0.4778, + "step": 6186, + "task_loss": 0.6780846118927002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3229963183403015, + "epoch": 5.23, + "learning_rate": 1.2643400555488469e-05, + "loss": 0.3875, + "step": 6187, + "task_loss": 0.7619109749794006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44008105993270874, + "epoch": 5.23, + "learning_rate": 1.2637362637362638e-05, + "loss": 0.448, + "step": 6188, + "task_loss": 0.08472947776317596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2771660387516022, + "epoch": 5.23, + "learning_rate": 1.2631324719236808e-05, + "loss": 0.4453, + "step": 6189, + "task_loss": 0.1630460023880005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4467581510543823, + "epoch": 5.23, + "learning_rate": 1.2625286801110977e-05, + "loss": 0.491, + "step": 6190, + "task_loss": 0.2543722093105316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37369856238365173, + "epoch": 5.23, + "learning_rate": 1.2619248882985146e-05, + "loss": 0.3938, + "step": 6191, + "task_loss": 0.5769704580307007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4213883876800537, + "epoch": 5.23, + "learning_rate": 1.2613210964859318e-05, + "loss": 0.6675, + "step": 6192, + "task_loss": 0.6167872548103333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30494827032089233, + "epoch": 5.23, + "learning_rate": 1.2607173046733486e-05, + "loss": 0.3618, + "step": 6193, + "task_loss": 0.6337176561355591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4974883198738098, + "epoch": 5.24, + "learning_rate": 1.2601135128607658e-05, + "loss": 0.6062, + "step": 6194, + "task_loss": 0.8943815231323242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27975523471832275, + "epoch": 5.24, + "learning_rate": 1.2595097210481827e-05, + "loss": 0.5089, + "step": 6195, + "task_loss": 0.20117653906345367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30887091159820557, + "epoch": 5.24, + "learning_rate": 1.2589059292355996e-05, + "loss": 0.3348, + "step": 6196, + "task_loss": 0.378909170627594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35314202308654785, + "epoch": 5.24, + "learning_rate": 1.2583021374230166e-05, + "loss": 0.4535, + "step": 6197, + "task_loss": 0.6679708361625671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47578901052474976, + "epoch": 5.24, + "learning_rate": 1.2576983456104335e-05, + "loss": 0.4455, + "step": 6198, + "task_loss": 0.04629099369049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40176236629486084, + "epoch": 5.24, + "learning_rate": 1.2570945537978507e-05, + "loss": 0.4337, + "step": 6199, + "task_loss": 0.5134381055831909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49800628423690796, + "epoch": 5.24, + "learning_rate": 1.2564907619852676e-05, + "loss": 0.4757, + "step": 6200, + "task_loss": 0.21907542645931244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20796847343444824, + "epoch": 5.24, + "learning_rate": 1.2558869701726843e-05, + "loss": 0.3775, + "step": 6201, + "task_loss": 0.034181609749794006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5537691116333008, + "epoch": 5.24, + "learning_rate": 1.2552831783601016e-05, + "loss": 0.5178, + "step": 6202, + "task_loss": 0.7014821767807007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33283016085624695, + "epoch": 5.24, + "learning_rate": 1.2546793865475185e-05, + "loss": 0.4908, + "step": 6203, + "task_loss": 0.24621935188770294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4907032251358032, + "epoch": 5.24, + "learning_rate": 1.2540755947349355e-05, + "loss": 0.5118, + "step": 6204, + "task_loss": 0.18759043514728546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.342742919921875, + "epoch": 5.24, + "learning_rate": 1.2534718029223524e-05, + "loss": 0.4898, + "step": 6205, + "task_loss": 0.7270870208740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4542190730571747, + "epoch": 5.25, + "learning_rate": 1.2528680111097693e-05, + "loss": 0.4588, + "step": 6206, + "task_loss": 0.7517849206924438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.451686829328537, + "epoch": 5.25, + "learning_rate": 1.2522642192971865e-05, + "loss": 0.4712, + "step": 6207, + "task_loss": 1.12918221950531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4083024859428406, + "epoch": 5.25, + "learning_rate": 1.2516604274846034e-05, + "loss": 0.4374, + "step": 6208, + "task_loss": 0.0881195217370987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5862739086151123, + "epoch": 5.25, + "learning_rate": 1.2510566356720205e-05, + "loss": 0.4768, + "step": 6209, + "task_loss": 0.39463233947753906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5894033312797546, + "epoch": 5.25, + "learning_rate": 1.2504528438594374e-05, + "loss": 0.5225, + "step": 6210, + "task_loss": 0.5741052627563477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5622382164001465, + "epoch": 5.25, + "learning_rate": 1.2498490520468544e-05, + "loss": 0.6311, + "step": 6211, + "task_loss": 0.6506113409996033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7736029624938965, + "epoch": 5.25, + "learning_rate": 1.2492452602342713e-05, + "loss": 0.4799, + "step": 6212, + "task_loss": 0.6603603959083557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2468295395374298, + "epoch": 5.25, + "learning_rate": 1.2486414684216882e-05, + "loss": 0.4483, + "step": 6213, + "task_loss": 0.14578548073768616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6984173059463501, + "epoch": 5.25, + "learning_rate": 1.2480376766091052e-05, + "loss": 0.5554, + "step": 6214, + "task_loss": 1.0485152006149292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41591858863830566, + "epoch": 5.25, + "learning_rate": 1.2474338847965223e-05, + "loss": 0.4275, + "step": 6215, + "task_loss": 1.4896053075790405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.602888822555542, + "epoch": 5.25, + "learning_rate": 1.2468300929839392e-05, + "loss": 0.49, + "step": 6216, + "task_loss": 1.1430680751800537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31008094549179077, + "epoch": 5.26, + "learning_rate": 1.246226301171356e-05, + "loss": 0.3567, + "step": 6217, + "task_loss": 0.8474169969558716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.13234040141105652, + "epoch": 5.26, + "learning_rate": 1.2456225093587731e-05, + "loss": 0.3969, + "step": 6218, + "task_loss": 0.6200963258743286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2713158428668976, + "epoch": 5.26, + "learning_rate": 1.2450187175461902e-05, + "loss": 0.3204, + "step": 6219, + "task_loss": 0.11687406152486801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48578497767448425, + "epoch": 5.26, + "learning_rate": 1.244414925733607e-05, + "loss": 0.5868, + "step": 6220, + "task_loss": 0.9350218176841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5125357508659363, + "epoch": 5.26, + "learning_rate": 1.2438111339210241e-05, + "loss": 0.4309, + "step": 6221, + "task_loss": 0.16614003479480743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25036102533340454, + "epoch": 5.26, + "learning_rate": 1.243207342108441e-05, + "loss": 0.3975, + "step": 6222, + "task_loss": 0.17539043724536896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3514832556247711, + "epoch": 5.26, + "learning_rate": 1.242603550295858e-05, + "loss": 0.468, + "step": 6223, + "task_loss": 0.16145166754722595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5061763525009155, + "epoch": 5.26, + "learning_rate": 1.241999758483275e-05, + "loss": 0.4297, + "step": 6224, + "task_loss": 0.1512872576713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5372095108032227, + "epoch": 5.26, + "learning_rate": 1.241395966670692e-05, + "loss": 0.5252, + "step": 6225, + "task_loss": 1.8842012882232666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3596974015235901, + "epoch": 5.26, + "learning_rate": 1.240792174858109e-05, + "loss": 0.3819, + "step": 6226, + "task_loss": 0.687288224697113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4767927825450897, + "epoch": 5.26, + "learning_rate": 1.240188383045526e-05, + "loss": 0.4024, + "step": 6227, + "task_loss": 0.7534205317497253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6467216610908508, + "epoch": 5.26, + "learning_rate": 1.2395845912329429e-05, + "loss": 0.6204, + "step": 6228, + "task_loss": 1.6777626276016235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4914214611053467, + "epoch": 5.27, + "learning_rate": 1.2389807994203599e-05, + "loss": 0.4628, + "step": 6229, + "task_loss": 0.7413581013679504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4860031008720398, + "epoch": 5.27, + "learning_rate": 1.238377007607777e-05, + "loss": 0.4634, + "step": 6230, + "task_loss": 0.4979691505432129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28255903720855713, + "epoch": 5.27, + "learning_rate": 1.2377732157951939e-05, + "loss": 0.4363, + "step": 6231, + "task_loss": 0.2801055610179901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.11712111532688141, + "epoch": 5.27, + "learning_rate": 1.2371694239826107e-05, + "loss": 0.4164, + "step": 6232, + "task_loss": 0.02010384574532509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5305463075637817, + "epoch": 5.27, + "learning_rate": 1.2365656321700278e-05, + "loss": 0.5305, + "step": 6233, + "task_loss": 1.7402743101119995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3652735948562622, + "epoch": 5.27, + "learning_rate": 1.2359618403574449e-05, + "loss": 0.5376, + "step": 6234, + "task_loss": 0.7852202653884888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35535115003585815, + "epoch": 5.27, + "learning_rate": 1.2353580485448617e-05, + "loss": 0.4457, + "step": 6235, + "task_loss": 0.2476380616426468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5949093699455261, + "epoch": 5.27, + "learning_rate": 1.2347542567322788e-05, + "loss": 0.4455, + "step": 6236, + "task_loss": 0.11201959103345871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4688261151313782, + "epoch": 5.27, + "learning_rate": 1.2341504649196957e-05, + "loss": 0.4586, + "step": 6237, + "task_loss": 0.3446193337440491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44974154233932495, + "epoch": 5.27, + "learning_rate": 1.2335466731071127e-05, + "loss": 0.4865, + "step": 6238, + "task_loss": 0.6242192983627319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5322498083114624, + "epoch": 5.27, + "learning_rate": 1.2329428812945296e-05, + "loss": 0.3348, + "step": 6239, + "task_loss": 0.3575191795825958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7311123609542847, + "epoch": 5.27, + "learning_rate": 1.2323390894819467e-05, + "loss": 0.5272, + "step": 6240, + "task_loss": 0.297283411026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5120882987976074, + "epoch": 5.28, + "learning_rate": 1.2317352976693638e-05, + "loss": 0.4282, + "step": 6241, + "task_loss": 1.5276340246200562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6023414134979248, + "epoch": 5.28, + "learning_rate": 1.2311315058567806e-05, + "loss": 0.4943, + "step": 6242, + "task_loss": 0.6430397033691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32146820425987244, + "epoch": 5.28, + "learning_rate": 1.2305277140441975e-05, + "loss": 0.4142, + "step": 6243, + "task_loss": 1.1973849534988403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2361760437488556, + "epoch": 5.28, + "learning_rate": 1.2299239222316146e-05, + "loss": 0.3907, + "step": 6244, + "task_loss": 0.5391025543212891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4394904375076294, + "epoch": 5.28, + "learning_rate": 1.2293201304190316e-05, + "loss": 0.4955, + "step": 6245, + "task_loss": 0.9576830863952637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5332996249198914, + "epoch": 5.28, + "learning_rate": 1.2287163386064485e-05, + "loss": 0.4263, + "step": 6246, + "task_loss": 0.47600847482681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5218952894210815, + "epoch": 5.28, + "learning_rate": 1.2281125467938654e-05, + "loss": 0.4282, + "step": 6247, + "task_loss": 0.42939120531082153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3090490400791168, + "epoch": 5.28, + "learning_rate": 1.2275087549812825e-05, + "loss": 0.4182, + "step": 6248, + "task_loss": 0.4032343029975891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5421972870826721, + "epoch": 5.28, + "learning_rate": 1.2269049631686995e-05, + "loss": 0.4019, + "step": 6249, + "task_loss": 1.0163698196411133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5863146781921387, + "epoch": 5.28, + "learning_rate": 1.2263011713561164e-05, + "loss": 0.4033, + "step": 6250, + "task_loss": 1.1737909317016602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18500067293643951, + "epoch": 5.28, + "learning_rate": 1.2256973795435335e-05, + "loss": 0.529, + "step": 6251, + "task_loss": 1.091176986694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44266489148139954, + "epoch": 5.28, + "learning_rate": 1.2250935877309504e-05, + "loss": 0.4928, + "step": 6252, + "task_loss": 0.6189447641372681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3941328227519989, + "epoch": 5.29, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.4469, + "step": 6253, + "task_loss": 0.41837334632873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4661310911178589, + "epoch": 5.29, + "learning_rate": 1.2238860041057843e-05, + "loss": 0.4981, + "step": 6254, + "task_loss": 0.5334681272506714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46993929147720337, + "epoch": 5.29, + "learning_rate": 1.2232822122932014e-05, + "loss": 0.4511, + "step": 6255, + "task_loss": 0.5718578696250916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6415864825248718, + "epoch": 5.29, + "learning_rate": 1.2226784204806184e-05, + "loss": 0.5874, + "step": 6256, + "task_loss": 1.166537880897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5136687755584717, + "epoch": 5.29, + "learning_rate": 1.2220746286680353e-05, + "loss": 0.4656, + "step": 6257, + "task_loss": 0.42546728253364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2938166856765747, + "epoch": 5.29, + "learning_rate": 1.2214708368554522e-05, + "loss": 0.5055, + "step": 6258, + "task_loss": 0.4821000397205353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5102195143699646, + "epoch": 5.29, + "learning_rate": 1.2208670450428693e-05, + "loss": 0.4723, + "step": 6259, + "task_loss": 0.3282877206802368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30259495973587036, + "epoch": 5.29, + "learning_rate": 1.2202632532302863e-05, + "loss": 0.5012, + "step": 6260, + "task_loss": 1.1107146739959717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41291919350624084, + "epoch": 5.29, + "learning_rate": 1.2196594614177034e-05, + "loss": 0.4243, + "step": 6261, + "task_loss": 0.42582619190216064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4471568167209625, + "epoch": 5.29, + "learning_rate": 1.2190556696051201e-05, + "loss": 0.3573, + "step": 6262, + "task_loss": 0.36311691999435425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.309916615486145, + "epoch": 5.29, + "learning_rate": 1.2184518777925371e-05, + "loss": 0.478, + "step": 6263, + "task_loss": 0.06853463500738144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3928259611129761, + "epoch": 5.29, + "learning_rate": 1.2178480859799542e-05, + "loss": 0.4288, + "step": 6264, + "task_loss": 0.7260500192642212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4291144907474518, + "epoch": 5.3, + "learning_rate": 1.2172442941673713e-05, + "loss": 0.4722, + "step": 6265, + "task_loss": 0.44616806507110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44337618350982666, + "epoch": 5.3, + "learning_rate": 1.2166405023547881e-05, + "loss": 0.431, + "step": 6266, + "task_loss": 0.6352015733718872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2501221001148224, + "epoch": 5.3, + "learning_rate": 1.216036710542205e-05, + "loss": 0.3787, + "step": 6267, + "task_loss": 0.17508773505687714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37083515524864197, + "epoch": 5.3, + "learning_rate": 1.2154329187296221e-05, + "loss": 0.4316, + "step": 6268, + "task_loss": 0.5260753631591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37162959575653076, + "epoch": 5.3, + "learning_rate": 1.2148291269170391e-05, + "loss": 0.3987, + "step": 6269, + "task_loss": 0.458967000246048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2888850271701813, + "epoch": 5.3, + "learning_rate": 1.214225335104456e-05, + "loss": 0.6822, + "step": 6270, + "task_loss": 0.034992583096027374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3902198374271393, + "epoch": 5.3, + "learning_rate": 1.2136215432918731e-05, + "loss": 0.4673, + "step": 6271, + "task_loss": 0.20604254305362701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4453846514225006, + "epoch": 5.3, + "learning_rate": 1.21301775147929e-05, + "loss": 0.5365, + "step": 6272, + "task_loss": 0.8998722434043884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.358317106962204, + "epoch": 5.3, + "learning_rate": 1.212413959666707e-05, + "loss": 0.6121, + "step": 6273, + "task_loss": 0.7969244718551636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2586548626422882, + "epoch": 5.3, + "learning_rate": 1.211810167854124e-05, + "loss": 0.3253, + "step": 6274, + "task_loss": 0.20838871598243713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2985243797302246, + "epoch": 5.3, + "learning_rate": 1.211206376041541e-05, + "loss": 0.3625, + "step": 6275, + "task_loss": 0.08204582333564758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3770885765552521, + "epoch": 5.3, + "learning_rate": 1.210602584228958e-05, + "loss": 0.4141, + "step": 6276, + "task_loss": 0.6199331283569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.556286096572876, + "epoch": 5.31, + "learning_rate": 1.209998792416375e-05, + "loss": 0.4459, + "step": 6277, + "task_loss": 1.3019596338272095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6244060397148132, + "epoch": 5.31, + "learning_rate": 1.2093950006037918e-05, + "loss": 0.4963, + "step": 6278, + "task_loss": 0.35750284790992737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3581559360027313, + "epoch": 5.31, + "learning_rate": 1.2087912087912089e-05, + "loss": 0.3596, + "step": 6279, + "task_loss": 0.23802399635314941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8221668004989624, + "epoch": 5.31, + "learning_rate": 1.208187416978626e-05, + "loss": 0.6341, + "step": 6280, + "task_loss": 1.0204393863677979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24638521671295166, + "epoch": 5.31, + "learning_rate": 1.2075836251660428e-05, + "loss": 0.3426, + "step": 6281, + "task_loss": 0.553345263004303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3210230767726898, + "epoch": 5.31, + "learning_rate": 1.2069798333534597e-05, + "loss": 0.4162, + "step": 6282, + "task_loss": 0.5214501619338989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5407940745353699, + "epoch": 5.31, + "learning_rate": 1.2063760415408768e-05, + "loss": 0.4556, + "step": 6283, + "task_loss": 0.727148175239563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5306996703147888, + "epoch": 5.31, + "learning_rate": 1.2057722497282938e-05, + "loss": 0.5674, + "step": 6284, + "task_loss": 0.9735710024833679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6272885799407959, + "epoch": 5.31, + "learning_rate": 1.2051684579157107e-05, + "loss": 0.4548, + "step": 6285, + "task_loss": 0.955990731716156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4695785641670227, + "epoch": 5.31, + "learning_rate": 1.2045646661031278e-05, + "loss": 0.4691, + "step": 6286, + "task_loss": 0.44260072708129883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38654008507728577, + "epoch": 5.31, + "learning_rate": 1.2039608742905447e-05, + "loss": 0.3902, + "step": 6287, + "task_loss": 0.19603301584720612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6396925449371338, + "epoch": 5.32, + "learning_rate": 1.2033570824779617e-05, + "loss": 0.4629, + "step": 6288, + "task_loss": 1.0044513940811157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3387939929962158, + "epoch": 5.32, + "learning_rate": 1.2027532906653786e-05, + "loss": 0.383, + "step": 6289, + "task_loss": 0.30343323945999146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7601823210716248, + "epoch": 5.32, + "learning_rate": 1.2021494988527957e-05, + "loss": 0.5635, + "step": 6290, + "task_loss": 0.3581352233886719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33006906509399414, + "epoch": 5.32, + "learning_rate": 1.2015457070402127e-05, + "loss": 0.4231, + "step": 6291, + "task_loss": 0.6971384882926941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42513829469680786, + "epoch": 5.32, + "learning_rate": 1.2009419152276296e-05, + "loss": 0.3747, + "step": 6292, + "task_loss": 0.7543697953224182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8252288699150085, + "epoch": 5.32, + "learning_rate": 1.2003381234150465e-05, + "loss": 0.3949, + "step": 6293, + "task_loss": 0.9222071766853333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.610644519329071, + "epoch": 5.32, + "learning_rate": 1.1997343316024635e-05, + "loss": 0.4701, + "step": 6294, + "task_loss": 0.4978223145008087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7126344442367554, + "epoch": 5.32, + "learning_rate": 1.1991305397898806e-05, + "loss": 0.5721, + "step": 6295, + "task_loss": 0.9451578259468079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3959302306175232, + "epoch": 5.32, + "learning_rate": 1.1985267479772975e-05, + "loss": 0.4084, + "step": 6296, + "task_loss": 0.5355932712554932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4511442184448242, + "epoch": 5.32, + "learning_rate": 1.1979229561647144e-05, + "loss": 0.4798, + "step": 6297, + "task_loss": 1.2487976551055908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40686511993408203, + "epoch": 5.32, + "learning_rate": 1.1973191643521314e-05, + "loss": 0.3995, + "step": 6298, + "task_loss": 1.0678472518920898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6790695190429688, + "epoch": 5.32, + "learning_rate": 1.1967153725395485e-05, + "loss": 0.4439, + "step": 6299, + "task_loss": 0.9739238619804382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5380898118019104, + "epoch": 5.33, + "learning_rate": 1.1961115807269654e-05, + "loss": 0.5202, + "step": 6300, + "task_loss": 0.860866129398346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5985127091407776, + "epoch": 5.33, + "learning_rate": 1.1955077889143823e-05, + "loss": 0.5638, + "step": 6301, + "task_loss": 1.0473815202713013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44799453020095825, + "epoch": 5.33, + "learning_rate": 1.1949039971017993e-05, + "loss": 0.4479, + "step": 6302, + "task_loss": 0.5221502184867859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7269673943519592, + "epoch": 5.33, + "learning_rate": 1.1943002052892164e-05, + "loss": 0.5037, + "step": 6303, + "task_loss": 1.4101895093917847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23132182657718658, + "epoch": 5.33, + "learning_rate": 1.1936964134766333e-05, + "loss": 0.4106, + "step": 6304, + "task_loss": 1.0466407537460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6625843048095703, + "epoch": 5.33, + "learning_rate": 1.1930926216640503e-05, + "loss": 0.5084, + "step": 6305, + "task_loss": 0.25994470715522766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3752896785736084, + "epoch": 5.33, + "learning_rate": 1.1924888298514672e-05, + "loss": 0.5054, + "step": 6306, + "task_loss": 0.49911820888519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28908780217170715, + "epoch": 5.33, + "learning_rate": 1.1918850380388843e-05, + "loss": 0.4871, + "step": 6307, + "task_loss": 0.45031410455703735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3758608102798462, + "epoch": 5.33, + "learning_rate": 1.1912812462263012e-05, + "loss": 0.4788, + "step": 6308, + "task_loss": 0.5773899555206299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4675436019897461, + "epoch": 5.33, + "learning_rate": 1.1906774544137182e-05, + "loss": 0.4884, + "step": 6309, + "task_loss": 1.1176221370697021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2508590519428253, + "epoch": 5.33, + "learning_rate": 1.1900736626011353e-05, + "loss": 0.3087, + "step": 6310, + "task_loss": 0.05610812082886696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40795424580574036, + "epoch": 5.33, + "learning_rate": 1.1894698707885522e-05, + "loss": 0.3986, + "step": 6311, + "task_loss": 0.3809436857700348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3572429418563843, + "epoch": 5.34, + "learning_rate": 1.188866078975969e-05, + "loss": 0.4171, + "step": 6312, + "task_loss": 0.39259326457977295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8064528703689575, + "epoch": 5.34, + "learning_rate": 1.1882622871633861e-05, + "loss": 0.4943, + "step": 6313, + "task_loss": 0.9118799567222595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43614137172698975, + "epoch": 5.34, + "learning_rate": 1.1876584953508032e-05, + "loss": 0.419, + "step": 6314, + "task_loss": 0.22691699862480164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26580676436424255, + "epoch": 5.34, + "learning_rate": 1.18705470353822e-05, + "loss": 0.3712, + "step": 6315, + "task_loss": 0.8999698162078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32874739170074463, + "epoch": 5.34, + "learning_rate": 1.186450911725637e-05, + "loss": 0.277, + "step": 6316, + "task_loss": 0.2942466735839844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39951837062835693, + "epoch": 5.34, + "learning_rate": 1.185847119913054e-05, + "loss": 0.4682, + "step": 6317, + "task_loss": 0.8691373467445374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31066566705703735, + "epoch": 5.34, + "learning_rate": 1.185243328100471e-05, + "loss": 0.3345, + "step": 6318, + "task_loss": 0.057947788387537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8941636085510254, + "epoch": 5.34, + "learning_rate": 1.184639536287888e-05, + "loss": 0.565, + "step": 6319, + "task_loss": 1.304322361946106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4426254332065582, + "epoch": 5.34, + "learning_rate": 1.184035744475305e-05, + "loss": 0.3892, + "step": 6320, + "task_loss": 1.3303678035736084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4537639915943146, + "epoch": 5.34, + "learning_rate": 1.1834319526627219e-05, + "loss": 0.4501, + "step": 6321, + "task_loss": 1.2391867637634277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5315889716148376, + "epoch": 5.34, + "learning_rate": 1.182828160850139e-05, + "loss": 0.5828, + "step": 6322, + "task_loss": 1.3790920972824097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1667761355638504, + "epoch": 5.34, + "learning_rate": 1.1822243690375558e-05, + "loss": 0.341, + "step": 6323, + "task_loss": 0.0829068273305893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4704621732234955, + "epoch": 5.35, + "learning_rate": 1.1816205772249729e-05, + "loss": 0.3422, + "step": 6324, + "task_loss": 0.30753093957901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5325760841369629, + "epoch": 5.35, + "learning_rate": 1.18101678541239e-05, + "loss": 0.5024, + "step": 6325, + "task_loss": 0.7614436745643616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49799850583076477, + "epoch": 5.35, + "learning_rate": 1.1804129935998068e-05, + "loss": 0.5632, + "step": 6326, + "task_loss": 0.31239959597587585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.383566677570343, + "epoch": 5.35, + "learning_rate": 1.1798092017872237e-05, + "loss": 0.4879, + "step": 6327, + "task_loss": 0.514318585395813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.779148519039154, + "epoch": 5.35, + "learning_rate": 1.1792054099746408e-05, + "loss": 0.5513, + "step": 6328, + "task_loss": 0.47329121828079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5753949880599976, + "epoch": 5.35, + "learning_rate": 1.1786016181620578e-05, + "loss": 0.5285, + "step": 6329, + "task_loss": 0.5735852718353271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5698537230491638, + "epoch": 5.35, + "learning_rate": 1.1779978263494749e-05, + "loss": 0.4434, + "step": 6330, + "task_loss": 0.7977812886238098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4839312732219696, + "epoch": 5.35, + "learning_rate": 1.1773940345368916e-05, + "loss": 0.5419, + "step": 6331, + "task_loss": 0.7991262674331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2376115620136261, + "epoch": 5.35, + "learning_rate": 1.1767902427243087e-05, + "loss": 0.376, + "step": 6332, + "task_loss": 0.6386260986328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42174047231674194, + "epoch": 5.35, + "learning_rate": 1.1761864509117257e-05, + "loss": 0.4343, + "step": 6333, + "task_loss": 0.39548665285110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3531261086463928, + "epoch": 5.35, + "learning_rate": 1.1755826590991428e-05, + "loss": 0.3574, + "step": 6334, + "task_loss": 0.19096443057060242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5499861836433411, + "epoch": 5.35, + "learning_rate": 1.1749788672865597e-05, + "loss": 0.512, + "step": 6335, + "task_loss": 0.3946600556373596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38409343361854553, + "epoch": 5.36, + "learning_rate": 1.1743750754739766e-05, + "loss": 0.3909, + "step": 6336, + "task_loss": 0.22466278076171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9586614370346069, + "epoch": 5.36, + "learning_rate": 1.1737712836613936e-05, + "loss": 0.7231, + "step": 6337, + "task_loss": 1.786522388458252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4369172155857086, + "epoch": 5.36, + "learning_rate": 1.1731674918488107e-05, + "loss": 0.4958, + "step": 6338, + "task_loss": 0.18028295040130615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7592207193374634, + "epoch": 5.36, + "learning_rate": 1.1725637000362276e-05, + "loss": 0.5847, + "step": 6339, + "task_loss": 1.1129190921783447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40526050329208374, + "epoch": 5.36, + "learning_rate": 1.1719599082236446e-05, + "loss": 0.4632, + "step": 6340, + "task_loss": 0.6580080986022949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6899394989013672, + "epoch": 5.36, + "learning_rate": 1.1713561164110615e-05, + "loss": 0.4546, + "step": 6341, + "task_loss": 1.111331582069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3958185911178589, + "epoch": 5.36, + "learning_rate": 1.1707523245984786e-05, + "loss": 0.674, + "step": 6342, + "task_loss": 0.5113704800605774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6063418984413147, + "epoch": 5.36, + "learning_rate": 1.1701485327858954e-05, + "loss": 0.5251, + "step": 6343, + "task_loss": 0.5524447560310364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.344043105840683, + "epoch": 5.36, + "learning_rate": 1.1695447409733125e-05, + "loss": 0.5171, + "step": 6344, + "task_loss": 0.6112009882926941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3540215790271759, + "epoch": 5.36, + "learning_rate": 1.1689409491607296e-05, + "loss": 0.4928, + "step": 6345, + "task_loss": 0.3446291387081146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2772657573223114, + "epoch": 5.36, + "learning_rate": 1.1683371573481463e-05, + "loss": 0.2337, + "step": 6346, + "task_loss": 0.4272204339504242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33394855260849, + "epoch": 5.36, + "learning_rate": 1.1677333655355633e-05, + "loss": 0.4259, + "step": 6347, + "task_loss": 1.058383822441101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23304812610149384, + "epoch": 5.37, + "learning_rate": 1.1671295737229804e-05, + "loss": 0.4209, + "step": 6348, + "task_loss": 0.5153782963752747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3450447916984558, + "epoch": 5.37, + "learning_rate": 1.1665257819103974e-05, + "loss": 0.3658, + "step": 6349, + "task_loss": 0.3279207646846771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7239758968353271, + "epoch": 5.37, + "learning_rate": 1.1659219900978143e-05, + "loss": 0.5568, + "step": 6350, + "task_loss": 1.4685956239700317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4245326817035675, + "epoch": 5.37, + "learning_rate": 1.1653181982852312e-05, + "loss": 0.4062, + "step": 6351, + "task_loss": 0.5888448357582092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3318585157394409, + "epoch": 5.37, + "learning_rate": 1.1647144064726483e-05, + "loss": 0.3297, + "step": 6352, + "task_loss": 0.19040919840335846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.489673376083374, + "epoch": 5.37, + "learning_rate": 1.1641106146600653e-05, + "loss": 0.4339, + "step": 6353, + "task_loss": 1.7305190563201904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24561791121959686, + "epoch": 5.37, + "learning_rate": 1.1635068228474822e-05, + "loss": 0.4788, + "step": 6354, + "task_loss": 0.8127731084823608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6332089900970459, + "epoch": 5.37, + "learning_rate": 1.1629030310348993e-05, + "loss": 0.5449, + "step": 6355, + "task_loss": 0.5302404165267944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6534754037857056, + "epoch": 5.37, + "learning_rate": 1.1622992392223162e-05, + "loss": 0.5078, + "step": 6356, + "task_loss": 0.6126828193664551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40321585536003113, + "epoch": 5.37, + "learning_rate": 1.1616954474097332e-05, + "loss": 0.3717, + "step": 6357, + "task_loss": 0.22689639031887054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32186442613601685, + "epoch": 5.37, + "learning_rate": 1.1610916555971501e-05, + "loss": 0.3777, + "step": 6358, + "task_loss": 0.4278821349143982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5879184007644653, + "epoch": 5.38, + "learning_rate": 1.1604878637845672e-05, + "loss": 0.5787, + "step": 6359, + "task_loss": 0.6254206299781799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5147631168365479, + "epoch": 5.38, + "learning_rate": 1.1598840719719842e-05, + "loss": 0.5361, + "step": 6360, + "task_loss": 0.8641071915626526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3364253640174866, + "epoch": 5.38, + "learning_rate": 1.1592802801594011e-05, + "loss": 0.4698, + "step": 6361, + "task_loss": 0.44530194997787476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38226544857025146, + "epoch": 5.38, + "learning_rate": 1.158676488346818e-05, + "loss": 0.4317, + "step": 6362, + "task_loss": 0.7274138927459717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3393498659133911, + "epoch": 5.38, + "learning_rate": 1.158072696534235e-05, + "loss": 0.5512, + "step": 6363, + "task_loss": 0.6877003312110901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4102311432361603, + "epoch": 5.38, + "learning_rate": 1.1574689047216521e-05, + "loss": 0.4554, + "step": 6364, + "task_loss": 0.23337914049625397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3142209053039551, + "epoch": 5.38, + "learning_rate": 1.156865112909069e-05, + "loss": 0.4394, + "step": 6365, + "task_loss": 0.10010781139135361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5791646838188171, + "epoch": 5.38, + "learning_rate": 1.1562613210964859e-05, + "loss": 0.4586, + "step": 6366, + "task_loss": 0.16384665668010712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6964355707168579, + "epoch": 5.38, + "learning_rate": 1.155657529283903e-05, + "loss": 0.5514, + "step": 6367, + "task_loss": 0.7484047412872314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.328196257352829, + "epoch": 5.38, + "learning_rate": 1.15505373747132e-05, + "loss": 0.44, + "step": 6368, + "task_loss": 0.5148732662200928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49885568022727966, + "epoch": 5.38, + "learning_rate": 1.1544499456587369e-05, + "loss": 0.4583, + "step": 6369, + "task_loss": 0.20545554161071777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2769629955291748, + "epoch": 5.38, + "learning_rate": 1.153846153846154e-05, + "loss": 0.4269, + "step": 6370, + "task_loss": 0.3381071090698242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5010702013969421, + "epoch": 5.39, + "learning_rate": 1.1532423620335708e-05, + "loss": 0.5281, + "step": 6371, + "task_loss": 0.8603624105453491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2136225402355194, + "epoch": 5.39, + "learning_rate": 1.1526385702209879e-05, + "loss": 0.4241, + "step": 6372, + "task_loss": 0.34543347358703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42111849784851074, + "epoch": 5.39, + "learning_rate": 1.1520347784084048e-05, + "loss": 0.4015, + "step": 6373, + "task_loss": 0.45545148849487305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7068221569061279, + "epoch": 5.39, + "learning_rate": 1.1514309865958218e-05, + "loss": 0.6417, + "step": 6374, + "task_loss": 0.7508832216262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36807018518447876, + "epoch": 5.39, + "learning_rate": 1.1508271947832389e-05, + "loss": 0.3598, + "step": 6375, + "task_loss": 0.4108993113040924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28906649351119995, + "epoch": 5.39, + "learning_rate": 1.1502234029706558e-05, + "loss": 0.3821, + "step": 6376, + "task_loss": 0.5109365582466125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41303277015686035, + "epoch": 5.39, + "learning_rate": 1.1496196111580727e-05, + "loss": 0.3843, + "step": 6377, + "task_loss": 0.6203212738037109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32407620549201965, + "epoch": 5.39, + "learning_rate": 1.1490158193454897e-05, + "loss": 0.4541, + "step": 6378, + "task_loss": 0.3936872184276581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6017963290214539, + "epoch": 5.39, + "learning_rate": 1.1484120275329068e-05, + "loss": 0.471, + "step": 6379, + "task_loss": 0.6045811176300049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41324788331985474, + "epoch": 5.39, + "learning_rate": 1.1478082357203237e-05, + "loss": 0.4025, + "step": 6380, + "task_loss": 1.426512360572815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39666372537612915, + "epoch": 5.39, + "learning_rate": 1.1472044439077406e-05, + "loss": 0.4812, + "step": 6381, + "task_loss": 0.386556476354599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40040332078933716, + "epoch": 5.39, + "learning_rate": 1.1466006520951576e-05, + "loss": 0.3474, + "step": 6382, + "task_loss": 0.3610733449459076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42494675517082214, + "epoch": 5.4, + "learning_rate": 1.1459968602825747e-05, + "loss": 0.4523, + "step": 6383, + "task_loss": 0.47069215774536133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5623818039894104, + "epoch": 5.4, + "learning_rate": 1.1453930684699916e-05, + "loss": 0.4397, + "step": 6384, + "task_loss": 0.7198392152786255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5985987782478333, + "epoch": 5.4, + "learning_rate": 1.1447892766574086e-05, + "loss": 0.4971, + "step": 6385, + "task_loss": 0.786270260810852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2824723422527313, + "epoch": 5.4, + "learning_rate": 1.1441854848448255e-05, + "loss": 0.4207, + "step": 6386, + "task_loss": 0.0542759969830513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33957523107528687, + "epoch": 5.4, + "learning_rate": 1.1435816930322426e-05, + "loss": 0.4801, + "step": 6387, + "task_loss": 0.5397275686264038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6089708805084229, + "epoch": 5.4, + "learning_rate": 1.1429779012196595e-05, + "loss": 0.5876, + "step": 6388, + "task_loss": 0.6468815803527832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5920357704162598, + "epoch": 5.4, + "learning_rate": 1.1423741094070765e-05, + "loss": 0.5292, + "step": 6389, + "task_loss": 0.25523439049720764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42629921436309814, + "epoch": 5.4, + "learning_rate": 1.1417703175944934e-05, + "loss": 0.4412, + "step": 6390, + "task_loss": 1.1363967657089233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4772029519081116, + "epoch": 5.4, + "learning_rate": 1.1411665257819105e-05, + "loss": 0.4764, + "step": 6391, + "task_loss": 0.7943968772888184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.439319372177124, + "epoch": 5.4, + "learning_rate": 1.1405627339693273e-05, + "loss": 0.4985, + "step": 6392, + "task_loss": 1.0646936893463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4251845180988312, + "epoch": 5.4, + "learning_rate": 1.1399589421567444e-05, + "loss": 0.5704, + "step": 6393, + "task_loss": 0.3912878930568695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36972397565841675, + "epoch": 5.4, + "learning_rate": 1.1393551503441615e-05, + "loss": 0.488, + "step": 6394, + "task_loss": 0.46971622109413147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45394986867904663, + "epoch": 5.41, + "learning_rate": 1.1387513585315783e-05, + "loss": 0.381, + "step": 6395, + "task_loss": 0.629061758518219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48511433601379395, + "epoch": 5.41, + "learning_rate": 1.1381475667189952e-05, + "loss": 0.4789, + "step": 6396, + "task_loss": 1.0026973485946655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27677080035209656, + "epoch": 5.41, + "learning_rate": 1.1375437749064123e-05, + "loss": 0.4603, + "step": 6397, + "task_loss": 0.7759247422218323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44869422912597656, + "epoch": 5.41, + "learning_rate": 1.1369399830938294e-05, + "loss": 0.3764, + "step": 6398, + "task_loss": 0.14486223459243774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4511607885360718, + "epoch": 5.41, + "learning_rate": 1.1363361912812464e-05, + "loss": 0.4461, + "step": 6399, + "task_loss": 0.7577351331710815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39995741844177246, + "epoch": 5.41, + "learning_rate": 1.1357323994686631e-05, + "loss": 0.4549, + "step": 6400, + "task_loss": 0.6534932851791382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3420477509498596, + "epoch": 5.41, + "learning_rate": 1.1351286076560802e-05, + "loss": 0.4871, + "step": 6401, + "task_loss": 0.5055237412452698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3617399334907532, + "epoch": 5.41, + "learning_rate": 1.1345248158434972e-05, + "loss": 0.5097, + "step": 6402, + "task_loss": 0.420353502035141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39528170228004456, + "epoch": 5.41, + "learning_rate": 1.1339210240309143e-05, + "loss": 0.5112, + "step": 6403, + "task_loss": 0.7756186127662659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45848339796066284, + "epoch": 5.41, + "learning_rate": 1.1333172322183312e-05, + "loss": 0.4251, + "step": 6404, + "task_loss": 0.7465702295303345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4680298864841461, + "epoch": 5.41, + "learning_rate": 1.132713440405748e-05, + "loss": 0.4385, + "step": 6405, + "task_loss": 0.24607042968273163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36823293566703796, + "epoch": 5.41, + "learning_rate": 1.1321096485931651e-05, + "loss": 0.5135, + "step": 6406, + "task_loss": 0.603877067565918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5664137005805969, + "epoch": 5.42, + "learning_rate": 1.131505856780582e-05, + "loss": 0.6184, + "step": 6407, + "task_loss": 0.7587255239486694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5582165718078613, + "epoch": 5.42, + "learning_rate": 1.130902064967999e-05, + "loss": 0.3506, + "step": 6408, + "task_loss": 0.443971186876297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3768201470375061, + "epoch": 5.42, + "learning_rate": 1.1302982731554161e-05, + "loss": 0.3418, + "step": 6409, + "task_loss": 0.04182751476764679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.529041051864624, + "epoch": 5.42, + "learning_rate": 1.129694481342833e-05, + "loss": 0.4336, + "step": 6410, + "task_loss": 1.1429561376571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6748365163803101, + "epoch": 5.42, + "learning_rate": 1.1290906895302499e-05, + "loss": 0.5418, + "step": 6411, + "task_loss": 0.8384679555892944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7615485191345215, + "epoch": 5.42, + "learning_rate": 1.128486897717667e-05, + "loss": 0.611, + "step": 6412, + "task_loss": 1.0083147287368774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49328261613845825, + "epoch": 5.42, + "learning_rate": 1.127883105905084e-05, + "loss": 0.396, + "step": 6413, + "task_loss": 0.8322123289108276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6751197576522827, + "epoch": 5.42, + "learning_rate": 1.127279314092501e-05, + "loss": 0.4995, + "step": 6414, + "task_loss": 1.2465537786483765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42600181698799133, + "epoch": 5.42, + "learning_rate": 1.1266755222799178e-05, + "loss": 0.49, + "step": 6415, + "task_loss": 0.34879347681999207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45959609746932983, + "epoch": 5.42, + "learning_rate": 1.1260717304673349e-05, + "loss": 0.5502, + "step": 6416, + "task_loss": 1.971940279006958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4291689991950989, + "epoch": 5.42, + "learning_rate": 1.1254679386547519e-05, + "loss": 0.5159, + "step": 6417, + "task_loss": 0.9801028370857239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4734179973602295, + "epoch": 5.42, + "learning_rate": 1.124864146842169e-05, + "loss": 0.3783, + "step": 6418, + "task_loss": 0.4718081057071686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3216676414012909, + "epoch": 5.43, + "learning_rate": 1.1242603550295859e-05, + "loss": 0.3321, + "step": 6419, + "task_loss": 0.44735509157180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32978951930999756, + "epoch": 5.43, + "learning_rate": 1.1236565632170027e-05, + "loss": 0.4263, + "step": 6420, + "task_loss": 0.12534235417842865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35204386711120605, + "epoch": 5.43, + "learning_rate": 1.1230527714044198e-05, + "loss": 0.4164, + "step": 6421, + "task_loss": 0.8288156986236572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6068124175071716, + "epoch": 5.43, + "learning_rate": 1.1224489795918369e-05, + "loss": 0.4998, + "step": 6422, + "task_loss": 0.31855127215385437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4088638126850128, + "epoch": 5.43, + "learning_rate": 1.1218451877792537e-05, + "loss": 0.6106, + "step": 6423, + "task_loss": 0.5528804659843445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3809813857078552, + "epoch": 5.43, + "learning_rate": 1.1212413959666708e-05, + "loss": 0.4612, + "step": 6424, + "task_loss": 1.1572620868682861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41128963232040405, + "epoch": 5.43, + "learning_rate": 1.1206376041540877e-05, + "loss": 0.4602, + "step": 6425, + "task_loss": 0.25753000378608704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35183602571487427, + "epoch": 5.43, + "learning_rate": 1.1200338123415047e-05, + "loss": 0.5187, + "step": 6426, + "task_loss": 0.5737191438674927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6274593472480774, + "epoch": 5.43, + "learning_rate": 1.1194300205289216e-05, + "loss": 0.579, + "step": 6427, + "task_loss": 0.6048048734664917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34852197766304016, + "epoch": 5.43, + "learning_rate": 1.1188262287163387e-05, + "loss": 0.4036, + "step": 6428, + "task_loss": 0.18281786143779755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.648535966873169, + "epoch": 5.43, + "learning_rate": 1.1182224369037557e-05, + "loss": 0.5554, + "step": 6429, + "task_loss": 0.40111470222473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49604859948158264, + "epoch": 5.44, + "learning_rate": 1.1176186450911726e-05, + "loss": 0.4556, + "step": 6430, + "task_loss": 0.2758389413356781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.320651650428772, + "epoch": 5.44, + "learning_rate": 1.1170148532785895e-05, + "loss": 0.4658, + "step": 6431, + "task_loss": 0.13978277146816254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2970947325229645, + "epoch": 5.44, + "learning_rate": 1.1164110614660066e-05, + "loss": 0.3977, + "step": 6432, + "task_loss": 0.14437319338321686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38155215978622437, + "epoch": 5.44, + "learning_rate": 1.1158072696534236e-05, + "loss": 0.4388, + "step": 6433, + "task_loss": 0.4137100875377655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35221874713897705, + "epoch": 5.44, + "learning_rate": 1.1152034778408405e-05, + "loss": 0.4908, + "step": 6434, + "task_loss": 0.7819401621818542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5120300054550171, + "epoch": 5.44, + "learning_rate": 1.1145996860282574e-05, + "loss": 0.4366, + "step": 6435, + "task_loss": 0.8740734457969666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3415496349334717, + "epoch": 5.44, + "learning_rate": 1.1139958942156745e-05, + "loss": 0.3521, + "step": 6436, + "task_loss": 0.7334772348403931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37128493189811707, + "epoch": 5.44, + "learning_rate": 1.1133921024030915e-05, + "loss": 0.4353, + "step": 6437, + "task_loss": 0.7957016229629517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1981428563594818, + "epoch": 5.44, + "learning_rate": 1.1127883105905084e-05, + "loss": 0.3754, + "step": 6438, + "task_loss": 0.053139738738536835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4762312173843384, + "epoch": 5.44, + "learning_rate": 1.1121845187779255e-05, + "loss": 0.4237, + "step": 6439, + "task_loss": 0.0740610733628273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7873530387878418, + "epoch": 5.44, + "learning_rate": 1.1115807269653424e-05, + "loss": 0.5736, + "step": 6440, + "task_loss": 0.8475521206855774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40703994035720825, + "epoch": 5.44, + "learning_rate": 1.1109769351527594e-05, + "loss": 0.4414, + "step": 6441, + "task_loss": 1.3802993297576904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.345816969871521, + "epoch": 5.45, + "learning_rate": 1.1103731433401763e-05, + "loss": 0.3743, + "step": 6442, + "task_loss": 0.8764696717262268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7445118427276611, + "epoch": 5.45, + "learning_rate": 1.1097693515275934e-05, + "loss": 0.5509, + "step": 6443, + "task_loss": 0.37660279870033264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5909204483032227, + "epoch": 5.45, + "learning_rate": 1.1091655597150104e-05, + "loss": 0.4894, + "step": 6444, + "task_loss": 1.2657946348190308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3718975782394409, + "epoch": 5.45, + "learning_rate": 1.1085617679024273e-05, + "loss": 0.4873, + "step": 6445, + "task_loss": 0.501110315322876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33142584562301636, + "epoch": 5.45, + "learning_rate": 1.1079579760898442e-05, + "loss": 0.559, + "step": 6446, + "task_loss": 0.7191939949989319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.611814022064209, + "epoch": 5.45, + "learning_rate": 1.1073541842772613e-05, + "loss": 0.578, + "step": 6447, + "task_loss": 0.268498957157135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3986048400402069, + "epoch": 5.45, + "learning_rate": 1.1067503924646783e-05, + "loss": 0.5072, + "step": 6448, + "task_loss": 0.48831111192703247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5388194918632507, + "epoch": 5.45, + "learning_rate": 1.1061466006520952e-05, + "loss": 0.5412, + "step": 6449, + "task_loss": 0.5628845691680908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 1.0428508520126343, + "epoch": 5.45, + "learning_rate": 1.1055428088395121e-05, + "loss": 0.5683, + "step": 6450, + "task_loss": 1.226529598236084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4304138422012329, + "epoch": 5.45, + "learning_rate": 1.1049390170269291e-05, + "loss": 0.5456, + "step": 6451, + "task_loss": 0.5530934929847717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2857217788696289, + "epoch": 5.45, + "learning_rate": 1.1043352252143462e-05, + "loss": 0.341, + "step": 6452, + "task_loss": 0.3938648998737335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27217262983322144, + "epoch": 5.45, + "learning_rate": 1.1037314334017631e-05, + "loss": 0.3429, + "step": 6453, + "task_loss": 0.8099424839019775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6063569784164429, + "epoch": 5.46, + "learning_rate": 1.1031276415891801e-05, + "loss": 0.5162, + "step": 6454, + "task_loss": 0.9602629542350769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4714534878730774, + "epoch": 5.46, + "learning_rate": 1.102523849776597e-05, + "loss": 0.5237, + "step": 6455, + "task_loss": 0.34096571803092957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48314720392227173, + "epoch": 5.46, + "learning_rate": 1.1019200579640141e-05, + "loss": 0.5024, + "step": 6456, + "task_loss": 0.4365038275718689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2683447003364563, + "epoch": 5.46, + "learning_rate": 1.101316266151431e-05, + "loss": 0.4358, + "step": 6457, + "task_loss": 0.5945911407470703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8255869150161743, + "epoch": 5.46, + "learning_rate": 1.100712474338848e-05, + "loss": 0.4642, + "step": 6458, + "task_loss": 0.9104630351066589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40880635380744934, + "epoch": 5.46, + "learning_rate": 1.1001086825262651e-05, + "loss": 0.4075, + "step": 6459, + "task_loss": 0.377916544675827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43631649017333984, + "epoch": 5.46, + "learning_rate": 1.099504890713682e-05, + "loss": 0.4785, + "step": 6460, + "task_loss": 0.5686953663825989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3980233371257782, + "epoch": 5.46, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.4119, + "step": 6461, + "task_loss": 1.5459638833999634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38641515374183655, + "epoch": 5.46, + "learning_rate": 1.098297307088516e-05, + "loss": 0.4423, + "step": 6462, + "task_loss": 0.33670857548713684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.327347069978714, + "epoch": 5.46, + "learning_rate": 1.097693515275933e-05, + "loss": 0.4952, + "step": 6463, + "task_loss": 0.5231347680091858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.621661365032196, + "epoch": 5.46, + "learning_rate": 1.09708972346335e-05, + "loss": 0.4326, + "step": 6464, + "task_loss": 1.2433968782424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2716473639011383, + "epoch": 5.46, + "learning_rate": 1.0964859316507668e-05, + "loss": 0.4567, + "step": 6465, + "task_loss": 1.1938037872314453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31608444452285767, + "epoch": 5.47, + "learning_rate": 1.0958821398381838e-05, + "loss": 0.4297, + "step": 6466, + "task_loss": 0.24395181238651276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39739203453063965, + "epoch": 5.47, + "learning_rate": 1.0952783480256009e-05, + "loss": 0.5702, + "step": 6467, + "task_loss": 1.445815086364746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31090232729911804, + "epoch": 5.47, + "learning_rate": 1.094674556213018e-05, + "loss": 0.4071, + "step": 6468, + "task_loss": 0.2702001929283142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9702149629592896, + "epoch": 5.47, + "learning_rate": 1.0940707644004348e-05, + "loss": 0.6727, + "step": 6469, + "task_loss": 1.1243218183517456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25207996368408203, + "epoch": 5.47, + "learning_rate": 1.0934669725878517e-05, + "loss": 0.3598, + "step": 6470, + "task_loss": 0.2202354520559311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7336540818214417, + "epoch": 5.47, + "learning_rate": 1.0928631807752688e-05, + "loss": 0.5804, + "step": 6471, + "task_loss": 1.0822430849075317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3480544984340668, + "epoch": 5.47, + "learning_rate": 1.0922593889626856e-05, + "loss": 0.5099, + "step": 6472, + "task_loss": 0.34024932980537415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4002733826637268, + "epoch": 5.47, + "learning_rate": 1.0916555971501027e-05, + "loss": 0.3996, + "step": 6473, + "task_loss": 1.1772016286849976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33383792638778687, + "epoch": 5.47, + "learning_rate": 1.0910518053375198e-05, + "loss": 0.4986, + "step": 6474, + "task_loss": 0.91908860206604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4257444143295288, + "epoch": 5.47, + "learning_rate": 1.0904480135249366e-05, + "loss": 0.4201, + "step": 6475, + "task_loss": 0.7394271492958069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3046905994415283, + "epoch": 5.47, + "learning_rate": 1.0898442217123535e-05, + "loss": 0.4473, + "step": 6476, + "task_loss": 0.6722404360771179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5753474235534668, + "epoch": 5.47, + "learning_rate": 1.0892404298997706e-05, + "loss": 0.5838, + "step": 6477, + "task_loss": 0.8624971508979797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5037566423416138, + "epoch": 5.48, + "learning_rate": 1.0886366380871877e-05, + "loss": 0.5519, + "step": 6478, + "task_loss": 1.0764614343643188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.333042174577713, + "epoch": 5.48, + "learning_rate": 1.0880328462746045e-05, + "loss": 0.3867, + "step": 6479, + "task_loss": 0.2359268218278885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30141475796699524, + "epoch": 5.48, + "learning_rate": 1.0874290544620214e-05, + "loss": 0.3965, + "step": 6480, + "task_loss": 0.25370681285858154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3596739172935486, + "epoch": 5.48, + "learning_rate": 1.0868252626494385e-05, + "loss": 0.3962, + "step": 6481, + "task_loss": 0.49250516295433044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2981621026992798, + "epoch": 5.48, + "learning_rate": 1.0862214708368555e-05, + "loss": 0.4751, + "step": 6482, + "task_loss": 0.5364390015602112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40946996212005615, + "epoch": 5.48, + "learning_rate": 1.0856176790242726e-05, + "loss": 0.5556, + "step": 6483, + "task_loss": 1.1326842308044434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3737828731536865, + "epoch": 5.48, + "learning_rate": 1.0850138872116893e-05, + "loss": 0.4095, + "step": 6484, + "task_loss": 0.8820573091506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36158227920532227, + "epoch": 5.48, + "learning_rate": 1.0844100953991064e-05, + "loss": 0.3872, + "step": 6485, + "task_loss": 0.9013544321060181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38583821058273315, + "epoch": 5.48, + "learning_rate": 1.0838063035865234e-05, + "loss": 0.5921, + "step": 6486, + "task_loss": 0.27379173040390015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48716580867767334, + "epoch": 5.48, + "learning_rate": 1.0832025117739405e-05, + "loss": 0.455, + "step": 6487, + "task_loss": 0.6701850891113281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3052396774291992, + "epoch": 5.48, + "learning_rate": 1.0825987199613574e-05, + "loss": 0.4788, + "step": 6488, + "task_loss": 0.0638214722275734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5256896018981934, + "epoch": 5.48, + "learning_rate": 1.0819949281487743e-05, + "loss": 0.4606, + "step": 6489, + "task_loss": 1.0799148082733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4381248354911804, + "epoch": 5.49, + "learning_rate": 1.0813911363361913e-05, + "loss": 0.3705, + "step": 6490, + "task_loss": 0.5858855247497559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3847799301147461, + "epoch": 5.49, + "learning_rate": 1.0807873445236084e-05, + "loss": 0.4466, + "step": 6491, + "task_loss": 1.9758387804031372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4201732277870178, + "epoch": 5.49, + "learning_rate": 1.0801835527110253e-05, + "loss": 0.4744, + "step": 6492, + "task_loss": 0.5729876756668091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7057101130485535, + "epoch": 5.49, + "learning_rate": 1.0795797608984423e-05, + "loss": 0.6407, + "step": 6493, + "task_loss": 0.5777044296264648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21219275891780853, + "epoch": 5.49, + "learning_rate": 1.0789759690858592e-05, + "loss": 0.3787, + "step": 6494, + "task_loss": 0.7489749193191528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33288317918777466, + "epoch": 5.49, + "learning_rate": 1.0783721772732763e-05, + "loss": 0.4137, + "step": 6495, + "task_loss": 0.3771737813949585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6851563453674316, + "epoch": 5.49, + "learning_rate": 1.0777683854606932e-05, + "loss": 0.5772, + "step": 6496, + "task_loss": 1.367717981338501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.503210186958313, + "epoch": 5.49, + "learning_rate": 1.0771645936481102e-05, + "loss": 0.4328, + "step": 6497, + "task_loss": 0.4273316264152527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32794228196144104, + "epoch": 5.49, + "learning_rate": 1.0765608018355273e-05, + "loss": 0.4324, + "step": 6498, + "task_loss": 1.2285724878311157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8235206007957458, + "epoch": 5.49, + "learning_rate": 1.0759570100229442e-05, + "loss": 0.589, + "step": 6499, + "task_loss": 1.5433768033981323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5095159411430359, + "epoch": 5.49, + "learning_rate": 1.075353218210361e-05, + "loss": 0.3995, + "step": 6500, + "task_loss": 0.7390114068984985 + }, + { + "epoch": 5.49, + "eval_accuracy": 0.9094653465346535, + "eval_loss": 0.29315459728240967, + "eval_runtime": 227.6647, + "eval_samples_per_second": 110.909, + "eval_steps_per_second": 0.87, + "step": 6500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.267596960067749, + "epoch": 5.5, + "learning_rate": 1.0747494263977781e-05, + "loss": 0.3898, + "step": 6501, + "task_loss": 0.4630683958530426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4312867224216461, + "epoch": 5.5, + "learning_rate": 1.0741456345851952e-05, + "loss": 0.3905, + "step": 6502, + "task_loss": 0.8280001878738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.307012140750885, + "epoch": 5.5, + "learning_rate": 1.073541842772612e-05, + "loss": 0.486, + "step": 6503, + "task_loss": 0.2967236340045929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5807651877403259, + "epoch": 5.5, + "learning_rate": 1.072938050960029e-05, + "loss": 0.4496, + "step": 6504, + "task_loss": 0.767183780670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6329793930053711, + "epoch": 5.5, + "learning_rate": 1.072334259147446e-05, + "loss": 0.4512, + "step": 6505, + "task_loss": 0.6327988505363464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46675875782966614, + "epoch": 5.5, + "learning_rate": 1.071730467334863e-05, + "loss": 0.4581, + "step": 6506, + "task_loss": 1.4379515647888184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3040703237056732, + "epoch": 5.5, + "learning_rate": 1.07112667552228e-05, + "loss": 0.402, + "step": 6507, + "task_loss": 0.6056128740310669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41238200664520264, + "epoch": 5.5, + "learning_rate": 1.070522883709697e-05, + "loss": 0.4144, + "step": 6508, + "task_loss": 0.17623336613178253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5403857231140137, + "epoch": 5.5, + "learning_rate": 1.0699190918971139e-05, + "loss": 0.5543, + "step": 6509, + "task_loss": 0.4151618778705597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3762030005455017, + "epoch": 5.5, + "learning_rate": 1.069315300084531e-05, + "loss": 0.3474, + "step": 6510, + "task_loss": 0.42921024560928345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3554438650608063, + "epoch": 5.5, + "learning_rate": 1.0687115082719478e-05, + "loss": 0.3935, + "step": 6511, + "task_loss": 0.4663337767124176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33299189805984497, + "epoch": 5.5, + "learning_rate": 1.0681077164593649e-05, + "loss": 0.5027, + "step": 6512, + "task_loss": 0.9205884337425232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4028089940547943, + "epoch": 5.51, + "learning_rate": 1.067503924646782e-05, + "loss": 0.4574, + "step": 6513, + "task_loss": 0.6021021008491516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2891165614128113, + "epoch": 5.51, + "learning_rate": 1.0669001328341988e-05, + "loss": 0.507, + "step": 6514, + "task_loss": 0.6659336090087891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23118877410888672, + "epoch": 5.51, + "learning_rate": 1.0662963410216157e-05, + "loss": 0.3977, + "step": 6515, + "task_loss": 0.6170550584793091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3620295524597168, + "epoch": 5.51, + "learning_rate": 1.0656925492090328e-05, + "loss": 0.3807, + "step": 6516, + "task_loss": 0.15695656836032867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6828116178512573, + "epoch": 5.51, + "learning_rate": 1.0650887573964498e-05, + "loss": 0.4686, + "step": 6517, + "task_loss": 1.088202714920044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7250940203666687, + "epoch": 5.51, + "learning_rate": 1.0644849655838667e-05, + "loss": 0.4492, + "step": 6518, + "task_loss": 0.3931708335876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41655704379081726, + "epoch": 5.51, + "learning_rate": 1.0638811737712836e-05, + "loss": 0.5239, + "step": 6519, + "task_loss": 0.5539785027503967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48181748390197754, + "epoch": 5.51, + "learning_rate": 1.0632773819587007e-05, + "loss": 0.4251, + "step": 6520, + "task_loss": 0.7910305261611938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5616582632064819, + "epoch": 5.51, + "learning_rate": 1.0626735901461177e-05, + "loss": 0.5108, + "step": 6521, + "task_loss": 0.5756950378417969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28778934478759766, + "epoch": 5.51, + "learning_rate": 1.0620697983335346e-05, + "loss": 0.3888, + "step": 6522, + "task_loss": 0.5771900415420532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5263245105743408, + "epoch": 5.51, + "learning_rate": 1.0614660065209517e-05, + "loss": 0.4604, + "step": 6523, + "task_loss": 0.7063573002815247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38132530450820923, + "epoch": 5.51, + "learning_rate": 1.0608622147083686e-05, + "loss": 0.502, + "step": 6524, + "task_loss": 0.31479889154434204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5895600318908691, + "epoch": 5.52, + "learning_rate": 1.0602584228957856e-05, + "loss": 0.5172, + "step": 6525, + "task_loss": 1.113694667816162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38343244791030884, + "epoch": 5.52, + "learning_rate": 1.0596546310832025e-05, + "loss": 0.3989, + "step": 6526, + "task_loss": 0.4916403293609619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25377658009529114, + "epoch": 5.52, + "learning_rate": 1.0590508392706196e-05, + "loss": 0.3731, + "step": 6527, + "task_loss": 0.33476370573043823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6367818713188171, + "epoch": 5.52, + "learning_rate": 1.0584470474580366e-05, + "loss": 0.5268, + "step": 6528, + "task_loss": 0.7910555005073547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5747889280319214, + "epoch": 5.52, + "learning_rate": 1.0578432556454535e-05, + "loss": 0.4819, + "step": 6529, + "task_loss": 0.37968146800994873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45716729760169983, + "epoch": 5.52, + "learning_rate": 1.0572394638328704e-05, + "loss": 0.4448, + "step": 6530, + "task_loss": 0.7606426477432251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6061018109321594, + "epoch": 5.52, + "learning_rate": 1.0566356720202874e-05, + "loss": 0.5286, + "step": 6531, + "task_loss": 0.9126724600791931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3092736303806305, + "epoch": 5.52, + "learning_rate": 1.0560318802077045e-05, + "loss": 0.3901, + "step": 6532, + "task_loss": 0.4011595845222473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46810585260391235, + "epoch": 5.52, + "learning_rate": 1.0554280883951216e-05, + "loss": 0.4159, + "step": 6533, + "task_loss": 1.1186864376068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15495768189430237, + "epoch": 5.52, + "learning_rate": 1.0548242965825383e-05, + "loss": 0.3195, + "step": 6534, + "task_loss": 0.013231070712208748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.547240674495697, + "epoch": 5.52, + "learning_rate": 1.0542205047699553e-05, + "loss": 0.4593, + "step": 6535, + "task_loss": 0.8312302827835083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31567081809043884, + "epoch": 5.52, + "learning_rate": 1.0536167129573724e-05, + "loss": 0.5383, + "step": 6536, + "task_loss": 0.31786632537841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49665123224258423, + "epoch": 5.53, + "learning_rate": 1.0530129211447893e-05, + "loss": 0.4475, + "step": 6537, + "task_loss": 0.45344409346580505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37035349011421204, + "epoch": 5.53, + "learning_rate": 1.0524091293322063e-05, + "loss": 0.5358, + "step": 6538, + "task_loss": 0.8119237422943115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49556565284729004, + "epoch": 5.53, + "learning_rate": 1.0518053375196232e-05, + "loss": 0.462, + "step": 6539, + "task_loss": 0.4232543706893921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8214705586433411, + "epoch": 5.53, + "learning_rate": 1.0512015457070403e-05, + "loss": 0.3938, + "step": 6540, + "task_loss": 1.2616276741027832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7333650588989258, + "epoch": 5.53, + "learning_rate": 1.0505977538944572e-05, + "loss": 0.5687, + "step": 6541, + "task_loss": 1.2309322357177734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43745866417884827, + "epoch": 5.53, + "learning_rate": 1.0499939620818742e-05, + "loss": 0.3518, + "step": 6542, + "task_loss": 0.2668023705482483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4480666518211365, + "epoch": 5.53, + "learning_rate": 1.0493901702692913e-05, + "loss": 0.4819, + "step": 6543, + "task_loss": 0.39329561591148376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6971666812896729, + "epoch": 5.53, + "learning_rate": 1.0487863784567082e-05, + "loss": 0.4724, + "step": 6544, + "task_loss": 1.146660327911377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.296968936920166, + "epoch": 5.53, + "learning_rate": 1.048182586644125e-05, + "loss": 0.4326, + "step": 6545, + "task_loss": 0.5446733832359314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3317286968231201, + "epoch": 5.53, + "learning_rate": 1.0475787948315421e-05, + "loss": 0.4223, + "step": 6546, + "task_loss": 1.5151302814483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22501042485237122, + "epoch": 5.53, + "learning_rate": 1.0469750030189592e-05, + "loss": 0.4854, + "step": 6547, + "task_loss": 0.3349217176437378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4007885456085205, + "epoch": 5.53, + "learning_rate": 1.0463712112063762e-05, + "loss": 0.3737, + "step": 6548, + "task_loss": 0.1894521415233612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6889320611953735, + "epoch": 5.54, + "learning_rate": 1.045767419393793e-05, + "loss": 0.499, + "step": 6549, + "task_loss": 0.9466208219528198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5189695358276367, + "epoch": 5.54, + "learning_rate": 1.04516362758121e-05, + "loss": 0.4881, + "step": 6550, + "task_loss": 1.6424872875213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36308836936950684, + "epoch": 5.54, + "learning_rate": 1.044559835768627e-05, + "loss": 0.3585, + "step": 6551, + "task_loss": 1.2149078845977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24555683135986328, + "epoch": 5.54, + "learning_rate": 1.0439560439560441e-05, + "loss": 0.3371, + "step": 6552, + "task_loss": 0.6002638936042786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46089375019073486, + "epoch": 5.54, + "learning_rate": 1.043352252143461e-05, + "loss": 0.542, + "step": 6553, + "task_loss": 0.7457107305526733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46785590052604675, + "epoch": 5.54, + "learning_rate": 1.0427484603308779e-05, + "loss": 0.4378, + "step": 6554, + "task_loss": 0.6289100050926208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5258687734603882, + "epoch": 5.54, + "learning_rate": 1.042144668518295e-05, + "loss": 0.5046, + "step": 6555, + "task_loss": 1.0865079164505005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28771448135375977, + "epoch": 5.54, + "learning_rate": 1.041540876705712e-05, + "loss": 0.4324, + "step": 6556, + "task_loss": 0.5617824792861938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4335520565509796, + "epoch": 5.54, + "learning_rate": 1.0409370848931289e-05, + "loss": 0.4145, + "step": 6557, + "task_loss": 0.781493067741394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40867680311203003, + "epoch": 5.54, + "learning_rate": 1.040333293080546e-05, + "loss": 0.4504, + "step": 6558, + "task_loss": 0.7835278511047363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34899264574050903, + "epoch": 5.54, + "learning_rate": 1.0397295012679628e-05, + "loss": 0.3902, + "step": 6559, + "task_loss": 0.5776785612106323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7723253965377808, + "epoch": 5.54, + "learning_rate": 1.0391257094553799e-05, + "loss": 0.4866, + "step": 6560, + "task_loss": 1.3316569328308105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.721788763999939, + "epoch": 5.55, + "learning_rate": 1.0385219176427968e-05, + "loss": 0.466, + "step": 6561, + "task_loss": 0.5291551947593689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21560153365135193, + "epoch": 5.55, + "learning_rate": 1.0379181258302138e-05, + "loss": 0.3585, + "step": 6562, + "task_loss": 0.06305918842554092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39289143681526184, + "epoch": 5.55, + "learning_rate": 1.0373143340176309e-05, + "loss": 0.4994, + "step": 6563, + "task_loss": 0.6463732719421387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24446183443069458, + "epoch": 5.55, + "learning_rate": 1.0367105422050478e-05, + "loss": 0.4623, + "step": 6564, + "task_loss": 0.12131837010383606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2689506709575653, + "epoch": 5.55, + "learning_rate": 1.0361067503924647e-05, + "loss": 0.408, + "step": 6565, + "task_loss": 1.2945120334625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5966702699661255, + "epoch": 5.55, + "learning_rate": 1.0355029585798817e-05, + "loss": 0.5155, + "step": 6566, + "task_loss": 0.8464288711547852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47356027364730835, + "epoch": 5.55, + "learning_rate": 1.0348991667672988e-05, + "loss": 0.414, + "step": 6567, + "task_loss": 1.1683868169784546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4869329333305359, + "epoch": 5.55, + "learning_rate": 1.0342953749547157e-05, + "loss": 0.4917, + "step": 6568, + "task_loss": 0.6952275037765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4912315011024475, + "epoch": 5.55, + "learning_rate": 1.0336915831421326e-05, + "loss": 0.4948, + "step": 6569, + "task_loss": 0.23212383687496185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44060713052749634, + "epoch": 5.55, + "learning_rate": 1.0330877913295496e-05, + "loss": 0.3936, + "step": 6570, + "task_loss": 1.5084562301635742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7930140495300293, + "epoch": 5.55, + "learning_rate": 1.0324839995169667e-05, + "loss": 0.5838, + "step": 6571, + "task_loss": 1.8577957153320312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3639751076698303, + "epoch": 5.56, + "learning_rate": 1.0318802077043836e-05, + "loss": 0.404, + "step": 6572, + "task_loss": 1.8433747291564941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5634630918502808, + "epoch": 5.56, + "learning_rate": 1.0312764158918005e-05, + "loss": 0.5731, + "step": 6573, + "task_loss": 0.2750808298587799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32636380195617676, + "epoch": 5.56, + "learning_rate": 1.0306726240792175e-05, + "loss": 0.4074, + "step": 6574, + "task_loss": 0.5030247569084167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5527387261390686, + "epoch": 5.56, + "learning_rate": 1.0300688322666346e-05, + "loss": 0.5011, + "step": 6575, + "task_loss": 0.6462475061416626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2701635956764221, + "epoch": 5.56, + "learning_rate": 1.0294650404540515e-05, + "loss": 0.5192, + "step": 6576, + "task_loss": 0.14141885936260223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3958849012851715, + "epoch": 5.56, + "learning_rate": 1.0288612486414685e-05, + "loss": 0.3827, + "step": 6577, + "task_loss": 0.338423490524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22311869263648987, + "epoch": 5.56, + "learning_rate": 1.0282574568288854e-05, + "loss": 0.436, + "step": 6578, + "task_loss": 0.6539350152015686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2878463864326477, + "epoch": 5.56, + "learning_rate": 1.0276536650163025e-05, + "loss": 0.364, + "step": 6579, + "task_loss": 0.768605649471283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4917476177215576, + "epoch": 5.56, + "learning_rate": 1.0270498732037193e-05, + "loss": 0.387, + "step": 6580, + "task_loss": 0.326134592294693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36220911145210266, + "epoch": 5.56, + "learning_rate": 1.0264460813911364e-05, + "loss": 0.3874, + "step": 6581, + "task_loss": 0.4775090515613556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7118590474128723, + "epoch": 5.56, + "learning_rate": 1.0258422895785535e-05, + "loss": 0.5283, + "step": 6582, + "task_loss": 1.0879887342453003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39235302805900574, + "epoch": 5.56, + "learning_rate": 1.0252384977659703e-05, + "loss": 0.4324, + "step": 6583, + "task_loss": 0.7971075773239136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2609184682369232, + "epoch": 5.57, + "learning_rate": 1.0246347059533872e-05, + "loss": 0.36, + "step": 6584, + "task_loss": 0.09308464080095291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4079083800315857, + "epoch": 5.57, + "learning_rate": 1.0240309141408043e-05, + "loss": 0.4782, + "step": 6585, + "task_loss": 1.465531587600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22239530086517334, + "epoch": 5.57, + "learning_rate": 1.0234271223282213e-05, + "loss": 0.3726, + "step": 6586, + "task_loss": 0.5140138864517212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2814912796020508, + "epoch": 5.57, + "learning_rate": 1.0228233305156382e-05, + "loss": 0.3784, + "step": 6587, + "task_loss": 0.19766443967819214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5245600938796997, + "epoch": 5.57, + "learning_rate": 1.0222195387030551e-05, + "loss": 0.4622, + "step": 6588, + "task_loss": 0.8332313299179077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5765300989151001, + "epoch": 5.57, + "learning_rate": 1.0216157468904722e-05, + "loss": 0.6196, + "step": 6589, + "task_loss": 0.5054519176483154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4425010085105896, + "epoch": 5.57, + "learning_rate": 1.0210119550778892e-05, + "loss": 0.4654, + "step": 6590, + "task_loss": 0.7009885907173157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3301137685775757, + "epoch": 5.57, + "learning_rate": 1.0204081632653061e-05, + "loss": 0.4439, + "step": 6591, + "task_loss": 0.4593878984451294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.728769063949585, + "epoch": 5.57, + "learning_rate": 1.0198043714527232e-05, + "loss": 0.4932, + "step": 6592, + "task_loss": 0.666101336479187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47873014211654663, + "epoch": 5.57, + "learning_rate": 1.01920057964014e-05, + "loss": 0.5216, + "step": 6593, + "task_loss": 0.7723004817962646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3291347622871399, + "epoch": 5.57, + "learning_rate": 1.0185967878275571e-05, + "loss": 0.4809, + "step": 6594, + "task_loss": 0.596272349357605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3822205662727356, + "epoch": 5.57, + "learning_rate": 1.017992996014974e-05, + "loss": 0.5021, + "step": 6595, + "task_loss": 0.19459110498428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3742750883102417, + "epoch": 5.58, + "learning_rate": 1.017389204202391e-05, + "loss": 0.4731, + "step": 6596, + "task_loss": 0.5387371778488159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33698081970214844, + "epoch": 5.58, + "learning_rate": 1.0167854123898081e-05, + "loss": 0.4758, + "step": 6597, + "task_loss": 0.16839255392551422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27669286727905273, + "epoch": 5.58, + "learning_rate": 1.016181620577225e-05, + "loss": 0.53, + "step": 6598, + "task_loss": 0.6592541933059692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3184857666492462, + "epoch": 5.58, + "learning_rate": 1.0155778287646419e-05, + "loss": 0.316, + "step": 6599, + "task_loss": 0.15906718373298645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2754586637020111, + "epoch": 5.58, + "learning_rate": 1.014974036952059e-05, + "loss": 0.3402, + "step": 6600, + "task_loss": 0.3642820417881012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6991140842437744, + "epoch": 5.58, + "learning_rate": 1.014370245139476e-05, + "loss": 0.5774, + "step": 6601, + "task_loss": 1.3737553358078003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4560391902923584, + "epoch": 5.58, + "learning_rate": 1.0137664533268929e-05, + "loss": 0.418, + "step": 6602, + "task_loss": 0.39918628334999084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8788506984710693, + "epoch": 5.58, + "learning_rate": 1.0131626615143098e-05, + "loss": 0.5149, + "step": 6603, + "task_loss": 1.6970852613449097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21642425656318665, + "epoch": 5.58, + "learning_rate": 1.0125588697017269e-05, + "loss": 0.4089, + "step": 6604, + "task_loss": 0.41126856207847595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8159054517745972, + "epoch": 5.58, + "learning_rate": 1.0119550778891439e-05, + "loss": 0.5059, + "step": 6605, + "task_loss": 1.5232696533203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23980490863323212, + "epoch": 5.58, + "learning_rate": 1.0113512860765608e-05, + "loss": 0.3965, + "step": 6606, + "task_loss": 0.1999807357788086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46025875210762024, + "epoch": 5.58, + "learning_rate": 1.0107474942639779e-05, + "loss": 0.4279, + "step": 6607, + "task_loss": 0.39939025044441223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2726353406906128, + "epoch": 5.59, + "learning_rate": 1.0101437024513947e-05, + "loss": 0.5294, + "step": 6608, + "task_loss": 0.1501113921403885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48617076873779297, + "epoch": 5.59, + "learning_rate": 1.0095399106388118e-05, + "loss": 0.5543, + "step": 6609, + "task_loss": 1.4043292999267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49170196056365967, + "epoch": 5.59, + "learning_rate": 1.0089361188262287e-05, + "loss": 0.485, + "step": 6610, + "task_loss": 0.8918325304985046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.14193212985992432, + "epoch": 5.59, + "learning_rate": 1.0083323270136457e-05, + "loss": 0.3734, + "step": 6611, + "task_loss": 2.095724582672119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40398669242858887, + "epoch": 5.59, + "learning_rate": 1.0077285352010628e-05, + "loss": 0.45, + "step": 6612, + "task_loss": 0.3939119279384613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2947441637516022, + "epoch": 5.59, + "learning_rate": 1.0071247433884797e-05, + "loss": 0.311, + "step": 6613, + "task_loss": 0.6469756364822388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4840250015258789, + "epoch": 5.59, + "learning_rate": 1.0065209515758966e-05, + "loss": 0.4771, + "step": 6614, + "task_loss": 0.6271854639053345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33416247367858887, + "epoch": 5.59, + "learning_rate": 1.0059171597633136e-05, + "loss": 0.4181, + "step": 6615, + "task_loss": 0.9095419645309448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24343633651733398, + "epoch": 5.59, + "learning_rate": 1.0053133679507307e-05, + "loss": 0.3446, + "step": 6616, + "task_loss": 0.9406329393386841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23963600397109985, + "epoch": 5.59, + "learning_rate": 1.0047095761381477e-05, + "loss": 0.3919, + "step": 6617, + "task_loss": 0.16745439171791077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37312281131744385, + "epoch": 5.59, + "learning_rate": 1.0041057843255645e-05, + "loss": 0.3665, + "step": 6618, + "task_loss": 0.1863144040107727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.433851957321167, + "epoch": 5.59, + "learning_rate": 1.0035019925129815e-05, + "loss": 0.3783, + "step": 6619, + "task_loss": 1.056527853012085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4294077754020691, + "epoch": 5.6, + "learning_rate": 1.0028982007003986e-05, + "loss": 0.3986, + "step": 6620, + "task_loss": 0.5657487511634827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5478466153144836, + "epoch": 5.6, + "learning_rate": 1.0022944088878156e-05, + "loss": 0.5003, + "step": 6621, + "task_loss": 0.29477477073669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4397248327732086, + "epoch": 5.6, + "learning_rate": 1.0016906170752325e-05, + "loss": 0.4485, + "step": 6622, + "task_loss": 1.2294903993606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5015079975128174, + "epoch": 5.6, + "learning_rate": 1.0010868252626494e-05, + "loss": 0.408, + "step": 6623, + "task_loss": 0.9752113223075867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6867103576660156, + "epoch": 5.6, + "learning_rate": 1.0004830334500665e-05, + "loss": 0.5454, + "step": 6624, + "task_loss": 1.0395159721374512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2731471061706543, + "epoch": 5.6, + "learning_rate": 9.998792416374835e-06, + "loss": 0.5503, + "step": 6625, + "task_loss": 0.033146001398563385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37536829710006714, + "epoch": 5.6, + "learning_rate": 9.992754498249004e-06, + "loss": 0.3884, + "step": 6626, + "task_loss": 0.5605231523513794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40159839391708374, + "epoch": 5.6, + "learning_rate": 9.986716580123175e-06, + "loss": 0.3766, + "step": 6627, + "task_loss": 1.0829672813415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2612251043319702, + "epoch": 5.6, + "learning_rate": 9.980678661997344e-06, + "loss": 0.5002, + "step": 6628, + "task_loss": 0.23830151557922363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7409428358078003, + "epoch": 5.6, + "learning_rate": 9.974640743871514e-06, + "loss": 0.5063, + "step": 6629, + "task_loss": 0.5875707268714905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2884894609451294, + "epoch": 5.6, + "learning_rate": 9.968602825745683e-06, + "loss": 0.3876, + "step": 6630, + "task_loss": 0.34800511598587036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2978453040122986, + "epoch": 5.6, + "learning_rate": 9.962564907619854e-06, + "loss": 0.4055, + "step": 6631, + "task_loss": 1.5410786867141724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5237534642219543, + "epoch": 5.61, + "learning_rate": 9.956526989494024e-06, + "loss": 0.4129, + "step": 6632, + "task_loss": 0.3374232351779938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27444136142730713, + "epoch": 5.61, + "learning_rate": 9.950489071368193e-06, + "loss": 0.5519, + "step": 6633, + "task_loss": 0.6435171365737915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36407366394996643, + "epoch": 5.61, + "learning_rate": 9.944451153242362e-06, + "loss": 0.539, + "step": 6634, + "task_loss": 0.4501280188560486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2637448310852051, + "epoch": 5.61, + "learning_rate": 9.938413235116533e-06, + "loss": 0.4482, + "step": 6635, + "task_loss": 0.0802384540438652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7030282020568848, + "epoch": 5.61, + "learning_rate": 9.932375316990703e-06, + "loss": 0.5904, + "step": 6636, + "task_loss": 0.9231976270675659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36062759160995483, + "epoch": 5.61, + "learning_rate": 9.926337398864872e-06, + "loss": 0.3176, + "step": 6637, + "task_loss": 0.15527167916297913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39053842425346375, + "epoch": 5.61, + "learning_rate": 9.92029948073904e-06, + "loss": 0.4494, + "step": 6638, + "task_loss": 0.7166573405265808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41911593079566956, + "epoch": 5.61, + "learning_rate": 9.914261562613211e-06, + "loss": 0.49, + "step": 6639, + "task_loss": 0.03881998732686043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33602818846702576, + "epoch": 5.61, + "learning_rate": 9.908223644487382e-06, + "loss": 0.4675, + "step": 6640, + "task_loss": 0.5163300037384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3858576714992523, + "epoch": 5.61, + "learning_rate": 9.902185726361551e-06, + "loss": 0.3959, + "step": 6641, + "task_loss": 0.6001129746437073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21975365281105042, + "epoch": 5.61, + "learning_rate": 9.896147808235721e-06, + "loss": 0.4421, + "step": 6642, + "task_loss": 0.24245241284370422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23400837182998657, + "epoch": 5.61, + "learning_rate": 9.89010989010989e-06, + "loss": 0.4236, + "step": 6643, + "task_loss": 0.3544316291809082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29419493675231934, + "epoch": 5.62, + "learning_rate": 9.884071971984061e-06, + "loss": 0.4154, + "step": 6644, + "task_loss": 0.06820455193519592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.722952127456665, + "epoch": 5.62, + "learning_rate": 9.87803405385823e-06, + "loss": 0.562, + "step": 6645, + "task_loss": 0.6815069913864136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2777334749698639, + "epoch": 5.62, + "learning_rate": 9.8719961357324e-06, + "loss": 0.3677, + "step": 6646, + "task_loss": 0.19722190499305725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29160311818122864, + "epoch": 5.62, + "learning_rate": 9.865958217606571e-06, + "loss": 0.3664, + "step": 6647, + "task_loss": 1.442821979522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3852737247943878, + "epoch": 5.62, + "learning_rate": 9.85992029948074e-06, + "loss": 0.4305, + "step": 6648, + "task_loss": 0.22547666728496552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5916048884391785, + "epoch": 5.62, + "learning_rate": 9.853882381354909e-06, + "loss": 0.3556, + "step": 6649, + "task_loss": 0.41200241446495056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46638646721839905, + "epoch": 5.62, + "learning_rate": 9.84784446322908e-06, + "loss": 0.4475, + "step": 6650, + "task_loss": 0.833582878112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3266313076019287, + "epoch": 5.62, + "learning_rate": 9.84180654510325e-06, + "loss": 0.355, + "step": 6651, + "task_loss": 0.37757450342178345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3368372917175293, + "epoch": 5.62, + "learning_rate": 9.835768626977419e-06, + "loss": 0.3408, + "step": 6652, + "task_loss": 0.5827327370643616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35288769006729126, + "epoch": 5.62, + "learning_rate": 9.829730708851588e-06, + "loss": 0.4475, + "step": 6653, + "task_loss": 0.9397169947624207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39362722635269165, + "epoch": 5.62, + "learning_rate": 9.823692790725758e-06, + "loss": 0.4139, + "step": 6654, + "task_loss": 0.6743589043617249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5709276795387268, + "epoch": 5.63, + "learning_rate": 9.817654872599929e-06, + "loss": 0.5207, + "step": 6655, + "task_loss": 1.3481749296188354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.65226811170578, + "epoch": 5.63, + "learning_rate": 9.811616954474098e-06, + "loss": 0.3978, + "step": 6656, + "task_loss": 1.205975890159607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45755618810653687, + "epoch": 5.63, + "learning_rate": 9.805579036348266e-06, + "loss": 0.5541, + "step": 6657, + "task_loss": 0.5163716673851013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5052016377449036, + "epoch": 5.63, + "learning_rate": 9.799541118222437e-06, + "loss": 0.4759, + "step": 6658, + "task_loss": 0.38544192910194397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25063127279281616, + "epoch": 5.63, + "learning_rate": 9.793503200096608e-06, + "loss": 0.2814, + "step": 6659, + "task_loss": 0.16403862833976746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7950208187103271, + "epoch": 5.63, + "learning_rate": 9.787465281970776e-06, + "loss": 0.5465, + "step": 6660, + "task_loss": 1.6837916374206543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34111353754997253, + "epoch": 5.63, + "learning_rate": 9.781427363844947e-06, + "loss": 0.3591, + "step": 6661, + "task_loss": 0.6819562315940857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6122892498970032, + "epoch": 5.63, + "learning_rate": 9.775389445719116e-06, + "loss": 0.4821, + "step": 6662, + "task_loss": 1.3727117776870728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44599127769470215, + "epoch": 5.63, + "learning_rate": 9.769351527593286e-06, + "loss": 0.3798, + "step": 6663, + "task_loss": 0.7604700922966003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3460988998413086, + "epoch": 5.63, + "learning_rate": 9.763313609467455e-06, + "loss": 0.4216, + "step": 6664, + "task_loss": 1.0671651363372803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4767155945301056, + "epoch": 5.63, + "learning_rate": 9.757275691341626e-06, + "loss": 0.5642, + "step": 6665, + "task_loss": 0.7221386432647705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1909727156162262, + "epoch": 5.63, + "learning_rate": 9.751237773215796e-06, + "loss": 0.3822, + "step": 6666, + "task_loss": 0.5514643788337708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33717846870422363, + "epoch": 5.64, + "learning_rate": 9.745199855089965e-06, + "loss": 0.4513, + "step": 6667, + "task_loss": 0.625873863697052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34607917070388794, + "epoch": 5.64, + "learning_rate": 9.739161936964134e-06, + "loss": 0.4372, + "step": 6668, + "task_loss": 0.3548850119113922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3890557289123535, + "epoch": 5.64, + "learning_rate": 9.733124018838305e-06, + "loss": 0.3306, + "step": 6669, + "task_loss": 0.4444583058357239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4147912859916687, + "epoch": 5.64, + "learning_rate": 9.727086100712475e-06, + "loss": 0.5539, + "step": 6670, + "task_loss": 1.2830756902694702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3511492609977722, + "epoch": 5.64, + "learning_rate": 9.721048182586644e-06, + "loss": 0.4775, + "step": 6671, + "task_loss": 0.5181450247764587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6945109367370605, + "epoch": 5.64, + "learning_rate": 9.715010264460813e-06, + "loss": 0.5289, + "step": 6672, + "task_loss": 0.623112678527832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4633328318595886, + "epoch": 5.64, + "learning_rate": 9.708972346334984e-06, + "loss": 0.4374, + "step": 6673, + "task_loss": 1.3965741395950317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4342568516731262, + "epoch": 5.64, + "learning_rate": 9.702934428209154e-06, + "loss": 0.584, + "step": 6674, + "task_loss": 1.0722990036010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3214791715145111, + "epoch": 5.64, + "learning_rate": 9.696896510083323e-06, + "loss": 0.3816, + "step": 6675, + "task_loss": 0.23681554198265076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44264864921569824, + "epoch": 5.64, + "learning_rate": 9.690858591957494e-06, + "loss": 0.4181, + "step": 6676, + "task_loss": 0.8178834319114685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5027637481689453, + "epoch": 5.64, + "learning_rate": 9.684820673831663e-06, + "loss": 0.5647, + "step": 6677, + "task_loss": 1.0067014694213867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28282684087753296, + "epoch": 5.64, + "learning_rate": 9.678782755705833e-06, + "loss": 0.3083, + "step": 6678, + "task_loss": 0.23867210745811462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2570165693759918, + "epoch": 5.65, + "learning_rate": 9.672744837580002e-06, + "loss": 0.4248, + "step": 6679, + "task_loss": 0.11859311908483505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3490985035896301, + "epoch": 5.65, + "learning_rate": 9.666706919454173e-06, + "loss": 0.582, + "step": 6680, + "task_loss": 0.6593077182769775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4586387574672699, + "epoch": 5.65, + "learning_rate": 9.660669001328343e-06, + "loss": 0.4267, + "step": 6681, + "task_loss": 0.18626613914966583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29908257722854614, + "epoch": 5.65, + "learning_rate": 9.654631083202512e-06, + "loss": 0.441, + "step": 6682, + "task_loss": 0.5742673277854919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6133557558059692, + "epoch": 5.65, + "learning_rate": 9.648593165076681e-06, + "loss": 0.5211, + "step": 6683, + "task_loss": 0.8323172330856323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3714480400085449, + "epoch": 5.65, + "learning_rate": 9.642555246950852e-06, + "loss": 0.4495, + "step": 6684, + "task_loss": 0.5231508016586304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3637115955352783, + "epoch": 5.65, + "learning_rate": 9.636517328825022e-06, + "loss": 0.3315, + "step": 6685, + "task_loss": 0.44225868582725525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3607504367828369, + "epoch": 5.65, + "learning_rate": 9.630479410699193e-06, + "loss": 0.4525, + "step": 6686, + "task_loss": 0.806952178478241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30264222621917725, + "epoch": 5.65, + "learning_rate": 9.62444149257336e-06, + "loss": 0.3679, + "step": 6687, + "task_loss": 0.3033922016620636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4768761396408081, + "epoch": 5.65, + "learning_rate": 9.61840357444753e-06, + "loss": 0.4562, + "step": 6688, + "task_loss": 0.74522465467453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5096253156661987, + "epoch": 5.65, + "learning_rate": 9.612365656321701e-06, + "loss": 0.4471, + "step": 6689, + "task_loss": 0.8947878479957581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33347612619400024, + "epoch": 5.65, + "learning_rate": 9.606327738195872e-06, + "loss": 0.4003, + "step": 6690, + "task_loss": 0.5078001022338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5866214036941528, + "epoch": 5.66, + "learning_rate": 9.60028982007004e-06, + "loss": 0.6146, + "step": 6691, + "task_loss": 1.1787878274917603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3063579201698303, + "epoch": 5.66, + "learning_rate": 9.59425190194421e-06, + "loss": 0.4343, + "step": 6692, + "task_loss": 1.459625005722046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31216102838516235, + "epoch": 5.66, + "learning_rate": 9.58821398381838e-06, + "loss": 0.4419, + "step": 6693, + "task_loss": 0.4528191387653351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46292898058891296, + "epoch": 5.66, + "learning_rate": 9.58217606569255e-06, + "loss": 0.4824, + "step": 6694, + "task_loss": 0.8653815388679504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5531997084617615, + "epoch": 5.66, + "learning_rate": 9.57613814756672e-06, + "loss": 0.7177, + "step": 6695, + "task_loss": 1.3586716651916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37402403354644775, + "epoch": 5.66, + "learning_rate": 9.57010022944089e-06, + "loss": 0.3436, + "step": 6696, + "task_loss": 0.41005295515060425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39014309644699097, + "epoch": 5.66, + "learning_rate": 9.564062311315059e-06, + "loss": 0.3683, + "step": 6697, + "task_loss": 0.8219631314277649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33870917558670044, + "epoch": 5.66, + "learning_rate": 9.55802439318923e-06, + "loss": 0.4387, + "step": 6698, + "task_loss": 0.41287946701049805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35838326811790466, + "epoch": 5.66, + "learning_rate": 9.551986475063398e-06, + "loss": 0.3966, + "step": 6699, + "task_loss": 0.6040765643119812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4094226658344269, + "epoch": 5.66, + "learning_rate": 9.545948556937569e-06, + "loss": 0.514, + "step": 6700, + "task_loss": 1.1427186727523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.300124853849411, + "epoch": 5.66, + "learning_rate": 9.53991063881174e-06, + "loss": 0.3639, + "step": 6701, + "task_loss": 0.6028019189834595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4802337884902954, + "epoch": 5.66, + "learning_rate": 9.533872720685908e-06, + "loss": 0.4124, + "step": 6702, + "task_loss": 0.9045143127441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29991692304611206, + "epoch": 5.67, + "learning_rate": 9.527834802560077e-06, + "loss": 0.3858, + "step": 6703, + "task_loss": 0.5465149283409119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6183332204818726, + "epoch": 5.67, + "learning_rate": 9.521796884434248e-06, + "loss": 0.4742, + "step": 6704, + "task_loss": 1.1405081748962402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6776962280273438, + "epoch": 5.67, + "learning_rate": 9.515758966308418e-06, + "loss": 0.5264, + "step": 6705, + "task_loss": 1.1502282619476318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6632376909255981, + "epoch": 5.67, + "learning_rate": 9.509721048182587e-06, + "loss": 0.4702, + "step": 6706, + "task_loss": 0.25684958696365356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46836820244789124, + "epoch": 5.67, + "learning_rate": 9.503683130056756e-06, + "loss": 0.3853, + "step": 6707, + "task_loss": 0.7917205691337585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6035056710243225, + "epoch": 5.67, + "learning_rate": 9.497645211930927e-06, + "loss": 0.6128, + "step": 6708, + "task_loss": 1.3602275848388672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26016175746917725, + "epoch": 5.67, + "learning_rate": 9.491607293805097e-06, + "loss": 0.3558, + "step": 6709, + "task_loss": 0.13024893403053284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36990880966186523, + "epoch": 5.67, + "learning_rate": 9.485569375679266e-06, + "loss": 0.5751, + "step": 6710, + "task_loss": 0.6088328957557678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4181291460990906, + "epoch": 5.67, + "learning_rate": 9.479531457553437e-06, + "loss": 0.4669, + "step": 6711, + "task_loss": 0.36415067315101624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3294379413127899, + "epoch": 5.67, + "learning_rate": 9.473493539427606e-06, + "loss": 0.3633, + "step": 6712, + "task_loss": 0.5303905606269836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1962207853794098, + "epoch": 5.67, + "learning_rate": 9.467455621301776e-06, + "loss": 0.3864, + "step": 6713, + "task_loss": 0.2849324345588684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.447213351726532, + "epoch": 5.67, + "learning_rate": 9.461417703175945e-06, + "loss": 0.3485, + "step": 6714, + "task_loss": 0.48791980743408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3434171676635742, + "epoch": 5.68, + "learning_rate": 9.455379785050116e-06, + "loss": 0.4139, + "step": 6715, + "task_loss": 1.0164415836334229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38467562198638916, + "epoch": 5.68, + "learning_rate": 9.449341866924286e-06, + "loss": 0.3267, + "step": 6716, + "task_loss": 0.37276050448417664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3801423907279968, + "epoch": 5.68, + "learning_rate": 9.443303948798455e-06, + "loss": 0.4988, + "step": 6717, + "task_loss": 1.2556339502334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.59651118516922, + "epoch": 5.68, + "learning_rate": 9.437266030672624e-06, + "loss": 0.4904, + "step": 6718, + "task_loss": 1.3749816417694092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.336675763130188, + "epoch": 5.68, + "learning_rate": 9.431228112546794e-06, + "loss": 0.4878, + "step": 6719, + "task_loss": 0.05013589933514595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5830874443054199, + "epoch": 5.68, + "learning_rate": 9.425190194420965e-06, + "loss": 0.5124, + "step": 6720, + "task_loss": 1.185306191444397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6072978377342224, + "epoch": 5.68, + "learning_rate": 9.419152276295134e-06, + "loss": 0.5109, + "step": 6721, + "task_loss": 0.5938040614128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30367282032966614, + "epoch": 5.68, + "learning_rate": 9.413114358169303e-06, + "loss": 0.5102, + "step": 6722, + "task_loss": 0.22702331840991974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5418579578399658, + "epoch": 5.68, + "learning_rate": 9.407076440043473e-06, + "loss": 0.5899, + "step": 6723, + "task_loss": 1.3945199251174927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34819936752319336, + "epoch": 5.68, + "learning_rate": 9.401038521917644e-06, + "loss": 0.5203, + "step": 6724, + "task_loss": 0.5698555111885071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35385438799858093, + "epoch": 5.68, + "learning_rate": 9.395000603791813e-06, + "loss": 0.4758, + "step": 6725, + "task_loss": 0.381542444229126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42342662811279297, + "epoch": 5.69, + "learning_rate": 9.388962685665983e-06, + "loss": 0.4329, + "step": 6726, + "task_loss": 0.5486181378364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.598606288433075, + "epoch": 5.69, + "learning_rate": 9.382924767540152e-06, + "loss": 0.5019, + "step": 6727, + "task_loss": 0.7312585711479187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5449569821357727, + "epoch": 5.69, + "learning_rate": 9.376886849414323e-06, + "loss": 0.525, + "step": 6728, + "task_loss": 0.7886682152748108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3742581009864807, + "epoch": 5.69, + "learning_rate": 9.370848931288492e-06, + "loss": 0.4147, + "step": 6729, + "task_loss": 0.46247342228889465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4146203398704529, + "epoch": 5.69, + "learning_rate": 9.364811013162662e-06, + "loss": 0.3097, + "step": 6730, + "task_loss": 0.3430410623550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3924112915992737, + "epoch": 5.69, + "learning_rate": 9.358773095036833e-06, + "loss": 0.5037, + "step": 6731, + "task_loss": 0.9792919158935547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23460236191749573, + "epoch": 5.69, + "learning_rate": 9.352735176911002e-06, + "loss": 0.464, + "step": 6732, + "task_loss": 0.9901689887046814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3037591576576233, + "epoch": 5.69, + "learning_rate": 9.34669725878517e-06, + "loss": 0.4725, + "step": 6733, + "task_loss": 0.8621353507041931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42176276445388794, + "epoch": 5.69, + "learning_rate": 9.340659340659341e-06, + "loss": 0.4571, + "step": 6734, + "task_loss": 0.7237581610679626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4091700315475464, + "epoch": 5.69, + "learning_rate": 9.334621422533512e-06, + "loss": 0.4007, + "step": 6735, + "task_loss": 0.43505626916885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6379737257957458, + "epoch": 5.69, + "learning_rate": 9.32858350440768e-06, + "loss": 0.7213, + "step": 6736, + "task_loss": 0.5482211709022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.345699280500412, + "epoch": 5.69, + "learning_rate": 9.32254558628185e-06, + "loss": 0.3487, + "step": 6737, + "task_loss": 1.1413778066635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40469950437545776, + "epoch": 5.7, + "learning_rate": 9.31650766815602e-06, + "loss": 0.5317, + "step": 6738, + "task_loss": 1.0044339895248413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39531129598617554, + "epoch": 5.7, + "learning_rate": 9.31046975003019e-06, + "loss": 0.3768, + "step": 6739, + "task_loss": 0.2556597590446472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37428170442581177, + "epoch": 5.7, + "learning_rate": 9.30443183190436e-06, + "loss": 0.4787, + "step": 6740, + "task_loss": 0.6278769969940186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4386274814605713, + "epoch": 5.7, + "learning_rate": 9.298393913778528e-06, + "loss": 0.4365, + "step": 6741, + "task_loss": 0.8018932938575745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.319701224565506, + "epoch": 5.7, + "learning_rate": 9.292355995652699e-06, + "loss": 0.4291, + "step": 6742, + "task_loss": 0.5312966108322144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2922467887401581, + "epoch": 5.7, + "learning_rate": 9.28631807752687e-06, + "loss": 0.4232, + "step": 6743, + "task_loss": 0.5795371532440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4109874665737152, + "epoch": 5.7, + "learning_rate": 9.280280159401038e-06, + "loss": 0.3291, + "step": 6744, + "task_loss": 0.568673312664032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44207483530044556, + "epoch": 5.7, + "learning_rate": 9.274242241275209e-06, + "loss": 0.4103, + "step": 6745, + "task_loss": 0.29672324657440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35554665327072144, + "epoch": 5.7, + "learning_rate": 9.268204323149378e-06, + "loss": 0.3378, + "step": 6746, + "task_loss": 0.48400193452835083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5641986727714539, + "epoch": 5.7, + "learning_rate": 9.262166405023548e-06, + "loss": 0.5711, + "step": 6747, + "task_loss": 0.37638866901397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42453595995903015, + "epoch": 5.7, + "learning_rate": 9.256128486897717e-06, + "loss": 0.4423, + "step": 6748, + "task_loss": 0.28984907269477844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4854713976383209, + "epoch": 5.7, + "learning_rate": 9.250090568771888e-06, + "loss": 0.3674, + "step": 6749, + "task_loss": 0.7928124666213989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5808979868888855, + "epoch": 5.71, + "learning_rate": 9.244052650646058e-06, + "loss": 0.3477, + "step": 6750, + "task_loss": 0.40570151805877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.317829966545105, + "epoch": 5.71, + "learning_rate": 9.238014732520227e-06, + "loss": 0.3822, + "step": 6751, + "task_loss": 0.3280867040157318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25338926911354065, + "epoch": 5.71, + "learning_rate": 9.231976814394396e-06, + "loss": 0.2955, + "step": 6752, + "task_loss": 0.9392604231834412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3951855003833771, + "epoch": 5.71, + "learning_rate": 9.225938896268567e-06, + "loss": 0.3871, + "step": 6753, + "task_loss": 0.5460217595100403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4774053692817688, + "epoch": 5.71, + "learning_rate": 9.219900978142737e-06, + "loss": 0.4375, + "step": 6754, + "task_loss": 1.0834779739379883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32697242498397827, + "epoch": 5.71, + "learning_rate": 9.213863060016908e-06, + "loss": 0.4048, + "step": 6755, + "task_loss": 0.6568202376365662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45566046237945557, + "epoch": 5.71, + "learning_rate": 9.207825141891075e-06, + "loss": 0.5246, + "step": 6756, + "task_loss": 0.24032281339168549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6121571063995361, + "epoch": 5.71, + "learning_rate": 9.201787223765246e-06, + "loss": 0.4573, + "step": 6757, + "task_loss": 1.489749789237976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5307443141937256, + "epoch": 5.71, + "learning_rate": 9.195749305639416e-06, + "loss": 0.4208, + "step": 6758, + "task_loss": 1.1334186792373657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46949708461761475, + "epoch": 5.71, + "learning_rate": 9.189711387513587e-06, + "loss": 0.4778, + "step": 6759, + "task_loss": 0.9315220713615417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43076300621032715, + "epoch": 5.71, + "learning_rate": 9.183673469387756e-06, + "loss": 0.4689, + "step": 6760, + "task_loss": 0.5402387380599976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3442353308200836, + "epoch": 5.71, + "learning_rate": 9.177635551261925e-06, + "loss": 0.4172, + "step": 6761, + "task_loss": 0.293009877204895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5514411330223083, + "epoch": 5.72, + "learning_rate": 9.171597633136095e-06, + "loss": 0.4457, + "step": 6762, + "task_loss": 0.41219499707221985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7351735234260559, + "epoch": 5.72, + "learning_rate": 9.165559715010266e-06, + "loss": 0.496, + "step": 6763, + "task_loss": 1.4759877920150757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2289533019065857, + "epoch": 5.72, + "learning_rate": 9.159521796884435e-06, + "loss": 0.3983, + "step": 6764, + "task_loss": 0.9207218289375305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43548262119293213, + "epoch": 5.72, + "learning_rate": 9.153483878758605e-06, + "loss": 0.4992, + "step": 6765, + "task_loss": 0.33177146315574646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3552061915397644, + "epoch": 5.72, + "learning_rate": 9.147445960632774e-06, + "loss": 0.4658, + "step": 6766, + "task_loss": 1.2425211668014526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32525435090065, + "epoch": 5.72, + "learning_rate": 9.141408042506945e-06, + "loss": 0.4366, + "step": 6767, + "task_loss": 0.27826035022735596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26304855942726135, + "epoch": 5.72, + "learning_rate": 9.135370124381113e-06, + "loss": 0.4448, + "step": 6768, + "task_loss": 0.9599641561508179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4889734387397766, + "epoch": 5.72, + "learning_rate": 9.129332206255284e-06, + "loss": 0.4101, + "step": 6769, + "task_loss": 1.1959550380706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24056431651115417, + "epoch": 5.72, + "learning_rate": 9.123294288129455e-06, + "loss": 0.4447, + "step": 6770, + "task_loss": 0.07262258976697922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5809975862503052, + "epoch": 5.72, + "learning_rate": 9.117256370003622e-06, + "loss": 0.5391, + "step": 6771, + "task_loss": 0.8092759847640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5652029514312744, + "epoch": 5.72, + "learning_rate": 9.111218451877792e-06, + "loss": 0.4202, + "step": 6772, + "task_loss": 0.7727020382881165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37270236015319824, + "epoch": 5.72, + "learning_rate": 9.105180533751963e-06, + "loss": 0.465, + "step": 6773, + "task_loss": 0.45642298460006714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3966851830482483, + "epoch": 5.73, + "learning_rate": 9.099142615626133e-06, + "loss": 0.5724, + "step": 6774, + "task_loss": 0.16713391244411469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3225492238998413, + "epoch": 5.73, + "learning_rate": 9.093104697500302e-06, + "loss": 0.3695, + "step": 6775, + "task_loss": 0.16621845960617065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4366122782230377, + "epoch": 5.73, + "learning_rate": 9.087066779374471e-06, + "loss": 0.4001, + "step": 6776, + "task_loss": 0.4593633711338043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37281203269958496, + "epoch": 5.73, + "learning_rate": 9.081028861248642e-06, + "loss": 0.4112, + "step": 6777, + "task_loss": 0.3386434316635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3283293843269348, + "epoch": 5.73, + "learning_rate": 9.074990943122812e-06, + "loss": 0.3803, + "step": 6778, + "task_loss": 0.4193355441093445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4778842329978943, + "epoch": 5.73, + "learning_rate": 9.068953024996981e-06, + "loss": 0.4565, + "step": 6779, + "task_loss": 0.6542305946350098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47622302174568176, + "epoch": 5.73, + "learning_rate": 9.062915106871152e-06, + "loss": 0.4627, + "step": 6780, + "task_loss": 0.20711469650268555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47977280616760254, + "epoch": 5.73, + "learning_rate": 9.05687718874532e-06, + "loss": 0.5215, + "step": 6781, + "task_loss": 1.0503017902374268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29045918583869934, + "epoch": 5.73, + "learning_rate": 9.050839270619491e-06, + "loss": 0.4797, + "step": 6782, + "task_loss": 0.5339498519897461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36565786600112915, + "epoch": 5.73, + "learning_rate": 9.04480135249366e-06, + "loss": 0.424, + "step": 6783, + "task_loss": 0.8045396208763123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47113776206970215, + "epoch": 5.73, + "learning_rate": 9.03876343436783e-06, + "loss": 0.4401, + "step": 6784, + "task_loss": 0.48665308952331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3332441747188568, + "epoch": 5.73, + "learning_rate": 9.032725516242001e-06, + "loss": 0.3482, + "step": 6785, + "task_loss": 0.5813766121864319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6250608563423157, + "epoch": 5.74, + "learning_rate": 9.02668759811617e-06, + "loss": 0.4354, + "step": 6786, + "task_loss": 0.7294788360595703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26129233837127686, + "epoch": 5.74, + "learning_rate": 9.020649679990339e-06, + "loss": 0.4388, + "step": 6787, + "task_loss": 0.4819594919681549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45985138416290283, + "epoch": 5.74, + "learning_rate": 9.01461176186451e-06, + "loss": 0.4855, + "step": 6788, + "task_loss": 0.851227343082428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17006535828113556, + "epoch": 5.74, + "learning_rate": 9.00857384373868e-06, + "loss": 0.5305, + "step": 6789, + "task_loss": 0.11492697149515152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5755500793457031, + "epoch": 5.74, + "learning_rate": 9.002535925612849e-06, + "loss": 0.4784, + "step": 6790, + "task_loss": 0.6575520038604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43961772322654724, + "epoch": 5.74, + "learning_rate": 8.996498007487018e-06, + "loss": 0.47, + "step": 6791, + "task_loss": 0.786239504814148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2823607921600342, + "epoch": 5.74, + "learning_rate": 8.990460089361189e-06, + "loss": 0.5403, + "step": 6792, + "task_loss": 0.6331005692481995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43732666969299316, + "epoch": 5.74, + "learning_rate": 8.984422171235359e-06, + "loss": 0.4802, + "step": 6793, + "task_loss": 0.22647225856781006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.680653989315033, + "epoch": 5.74, + "learning_rate": 8.978384253109528e-06, + "loss": 0.4763, + "step": 6794, + "task_loss": 0.7984157800674438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3626134395599365, + "epoch": 5.74, + "learning_rate": 8.972346334983699e-06, + "loss": 0.435, + "step": 6795, + "task_loss": 1.2877767086029053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5412952899932861, + "epoch": 5.74, + "learning_rate": 8.966308416857867e-06, + "loss": 0.4673, + "step": 6796, + "task_loss": 0.7032785415649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42447182536125183, + "epoch": 5.75, + "learning_rate": 8.960270498732038e-06, + "loss": 0.4847, + "step": 6797, + "task_loss": 0.34944894909858704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4984859228134155, + "epoch": 5.75, + "learning_rate": 8.954232580606207e-06, + "loss": 0.4616, + "step": 6798, + "task_loss": 0.4432244598865509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8174580931663513, + "epoch": 5.75, + "learning_rate": 8.948194662480377e-06, + "loss": 0.5702, + "step": 6799, + "task_loss": 0.366484671831131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21612553298473358, + "epoch": 5.75, + "learning_rate": 8.942156744354548e-06, + "loss": 0.4254, + "step": 6800, + "task_loss": 0.7730886936187744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5667479634284973, + "epoch": 5.75, + "learning_rate": 8.936118826228717e-06, + "loss": 0.4927, + "step": 6801, + "task_loss": 1.341439962387085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7919966578483582, + "epoch": 5.75, + "learning_rate": 8.930080908102886e-06, + "loss": 0.4646, + "step": 6802, + "task_loss": 0.6175392866134644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26226508617401123, + "epoch": 5.75, + "learning_rate": 8.924042989977056e-06, + "loss": 0.5002, + "step": 6803, + "task_loss": 0.496273934841156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7612067461013794, + "epoch": 5.75, + "learning_rate": 8.918005071851227e-06, + "loss": 0.4662, + "step": 6804, + "task_loss": 0.5117945075035095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4614354074001312, + "epoch": 5.75, + "learning_rate": 8.911967153725396e-06, + "loss": 0.5291, + "step": 6805, + "task_loss": 1.041252613067627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37100714445114136, + "epoch": 5.75, + "learning_rate": 8.905929235599565e-06, + "loss": 0.4614, + "step": 6806, + "task_loss": 1.1394340991973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3699692487716675, + "epoch": 5.75, + "learning_rate": 8.899891317473735e-06, + "loss": 0.4718, + "step": 6807, + "task_loss": 0.21843348443508148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4842161536216736, + "epoch": 5.75, + "learning_rate": 8.893853399347906e-06, + "loss": 0.5441, + "step": 6808, + "task_loss": 1.2731599807739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3685823678970337, + "epoch": 5.76, + "learning_rate": 8.887815481222075e-06, + "loss": 0.4217, + "step": 6809, + "task_loss": 0.3689992129802704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45533621311187744, + "epoch": 5.76, + "learning_rate": 8.881777563096245e-06, + "loss": 0.5183, + "step": 6810, + "task_loss": 0.11185700446367264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5002017617225647, + "epoch": 5.76, + "learning_rate": 8.875739644970414e-06, + "loss": 0.4604, + "step": 6811, + "task_loss": 0.32641032338142395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3219699561595917, + "epoch": 5.76, + "learning_rate": 8.869701726844585e-06, + "loss": 0.3196, + "step": 6812, + "task_loss": 0.45952340960502625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25067338347435, + "epoch": 5.76, + "learning_rate": 8.863663808718754e-06, + "loss": 0.4132, + "step": 6813, + "task_loss": 0.4273223280906677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21826279163360596, + "epoch": 5.76, + "learning_rate": 8.857625890592924e-06, + "loss": 0.2463, + "step": 6814, + "task_loss": 0.7874842882156372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6446139216423035, + "epoch": 5.76, + "learning_rate": 8.851587972467095e-06, + "loss": 0.5081, + "step": 6815, + "task_loss": 1.2316144704818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4131598472595215, + "epoch": 5.76, + "learning_rate": 8.845550054341264e-06, + "loss": 0.6092, + "step": 6816, + "task_loss": 0.7948697805404663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5868838429450989, + "epoch": 5.76, + "learning_rate": 8.839512136215432e-06, + "loss": 0.5729, + "step": 6817, + "task_loss": 1.8779619932174683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3616679310798645, + "epoch": 5.76, + "learning_rate": 8.833474218089603e-06, + "loss": 0.4276, + "step": 6818, + "task_loss": 0.39233511686325073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5627244710922241, + "epoch": 5.76, + "learning_rate": 8.827436299963774e-06, + "loss": 0.4294, + "step": 6819, + "task_loss": 0.9320204854011536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5246750116348267, + "epoch": 5.76, + "learning_rate": 8.821398381837944e-06, + "loss": 0.4661, + "step": 6820, + "task_loss": 0.41106876730918884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17428657412528992, + "epoch": 5.77, + "learning_rate": 8.815360463712111e-06, + "loss": 0.3551, + "step": 6821, + "task_loss": 0.3434470295906067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1982969343662262, + "epoch": 5.77, + "learning_rate": 8.809322545586282e-06, + "loss": 0.4446, + "step": 6822, + "task_loss": 0.1066550761461258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3522729277610779, + "epoch": 5.77, + "learning_rate": 8.803284627460452e-06, + "loss": 0.565, + "step": 6823, + "task_loss": 0.7793720364570618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6030377149581909, + "epoch": 5.77, + "learning_rate": 8.797246709334623e-06, + "loss": 0.4068, + "step": 6824, + "task_loss": 0.13307474553585052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6200074553489685, + "epoch": 5.77, + "learning_rate": 8.791208791208792e-06, + "loss": 0.404, + "step": 6825, + "task_loss": 1.0221736431121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46882468461990356, + "epoch": 5.77, + "learning_rate": 8.78517087308296e-06, + "loss": 0.4568, + "step": 6826, + "task_loss": 0.7106450796127319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37567901611328125, + "epoch": 5.77, + "learning_rate": 8.779132954957131e-06, + "loss": 0.4523, + "step": 6827, + "task_loss": 0.24244384467601776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9290674924850464, + "epoch": 5.77, + "learning_rate": 8.773095036831302e-06, + "loss": 0.59, + "step": 6828, + "task_loss": 1.031914234161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36427628993988037, + "epoch": 5.77, + "learning_rate": 8.76705711870547e-06, + "loss": 0.456, + "step": 6829, + "task_loss": 0.2579890787601471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2719225287437439, + "epoch": 5.77, + "learning_rate": 8.76101920057964e-06, + "loss": 0.4575, + "step": 6830, + "task_loss": 0.061803411692380905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.344133198261261, + "epoch": 5.77, + "learning_rate": 8.75498128245381e-06, + "loss": 0.5546, + "step": 6831, + "task_loss": 0.3911244571208954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8373414278030396, + "epoch": 5.77, + "learning_rate": 8.74894336432798e-06, + "loss": 0.5159, + "step": 6832, + "task_loss": 0.920238196849823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2709845304489136, + "epoch": 5.78, + "learning_rate": 8.74290544620215e-06, + "loss": 0.5485, + "step": 6833, + "task_loss": 1.269283413887024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4290524125099182, + "epoch": 5.78, + "learning_rate": 8.73686752807632e-06, + "loss": 0.4162, + "step": 6834, + "task_loss": 0.7809634804725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41308027505874634, + "epoch": 5.78, + "learning_rate": 8.73082960995049e-06, + "loss": 0.5419, + "step": 6835, + "task_loss": 0.9451996684074402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34122925996780396, + "epoch": 5.78, + "learning_rate": 8.724791691824658e-06, + "loss": 0.4626, + "step": 6836, + "task_loss": 0.22515572607517242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3117615282535553, + "epoch": 5.78, + "learning_rate": 8.718753773698829e-06, + "loss": 0.3745, + "step": 6837, + "task_loss": 0.4015805125236511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47680628299713135, + "epoch": 5.78, + "learning_rate": 8.712715855573e-06, + "loss": 0.5481, + "step": 6838, + "task_loss": 0.6055381894111633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3258705139160156, + "epoch": 5.78, + "learning_rate": 8.70667793744717e-06, + "loss": 0.3998, + "step": 6839, + "task_loss": 1.194960117340088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6732715368270874, + "epoch": 5.78, + "learning_rate": 8.700640019321337e-06, + "loss": 0.5333, + "step": 6840, + "task_loss": 1.1332221031188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37896040081977844, + "epoch": 5.78, + "learning_rate": 8.694602101195508e-06, + "loss": 0.3891, + "step": 6841, + "task_loss": 0.7380682229995728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21282166242599487, + "epoch": 5.78, + "learning_rate": 8.688564183069678e-06, + "loss": 0.4203, + "step": 6842, + "task_loss": 0.21935594081878662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3921148180961609, + "epoch": 5.78, + "learning_rate": 8.682526264943849e-06, + "loss": 0.3731, + "step": 6843, + "task_loss": 0.18771642446517944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6807649731636047, + "epoch": 5.78, + "learning_rate": 8.676488346818018e-06, + "loss": 0.476, + "step": 6844, + "task_loss": 1.7016767263412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2680549621582031, + "epoch": 5.79, + "learning_rate": 8.670450428692186e-06, + "loss": 0.3536, + "step": 6845, + "task_loss": 0.39849528670310974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4472857117652893, + "epoch": 5.79, + "learning_rate": 8.664412510566357e-06, + "loss": 0.4865, + "step": 6846, + "task_loss": 0.3608029782772064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3966866135597229, + "epoch": 5.79, + "learning_rate": 8.658374592440528e-06, + "loss": 0.4353, + "step": 6847, + "task_loss": 0.16099335253238678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3421272933483124, + "epoch": 5.79, + "learning_rate": 8.652336674314696e-06, + "loss": 0.4863, + "step": 6848, + "task_loss": 0.5604204535484314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31521034240722656, + "epoch": 5.79, + "learning_rate": 8.646298756188867e-06, + "loss": 0.3654, + "step": 6849, + "task_loss": 1.3463664054870605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.289031982421875, + "epoch": 5.79, + "learning_rate": 8.640260838063036e-06, + "loss": 0.369, + "step": 6850, + "task_loss": 0.21315786242485046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3626205325126648, + "epoch": 5.79, + "learning_rate": 8.634222919937206e-06, + "loss": 0.358, + "step": 6851, + "task_loss": 0.3071889281272888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23373094201087952, + "epoch": 5.79, + "learning_rate": 8.628185001811375e-06, + "loss": 0.3825, + "step": 6852, + "task_loss": 0.38108542561531067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3998470902442932, + "epoch": 5.79, + "learning_rate": 8.622147083685546e-06, + "loss": 0.4349, + "step": 6853, + "task_loss": 1.0409437417984009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4545266330242157, + "epoch": 5.79, + "learning_rate": 8.616109165559716e-06, + "loss": 0.5326, + "step": 6854, + "task_loss": 0.8539249897003174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4730611741542816, + "epoch": 5.79, + "learning_rate": 8.610071247433885e-06, + "loss": 0.4485, + "step": 6855, + "task_loss": 0.7993516325950623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5145604014396667, + "epoch": 5.79, + "learning_rate": 8.604033329308054e-06, + "loss": 0.3723, + "step": 6856, + "task_loss": 0.6746289730072021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47001469135284424, + "epoch": 5.8, + "learning_rate": 8.597995411182225e-06, + "loss": 0.484, + "step": 6857, + "task_loss": 0.7576054930686951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5502407550811768, + "epoch": 5.8, + "learning_rate": 8.591957493056395e-06, + "loss": 0.5648, + "step": 6858, + "task_loss": 0.867617666721344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.237340047955513, + "epoch": 5.8, + "learning_rate": 8.585919574930564e-06, + "loss": 0.3823, + "step": 6859, + "task_loss": 0.8760807514190674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5560070872306824, + "epoch": 5.8, + "learning_rate": 8.579881656804733e-06, + "loss": 0.4407, + "step": 6860, + "task_loss": 0.9598127603530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2689327895641327, + "epoch": 5.8, + "learning_rate": 8.573843738678904e-06, + "loss": 0.3381, + "step": 6861, + "task_loss": 0.6622861623764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.526469886302948, + "epoch": 5.8, + "learning_rate": 8.567805820553074e-06, + "loss": 0.4803, + "step": 6862, + "task_loss": 1.2142293453216553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34447401762008667, + "epoch": 5.8, + "learning_rate": 8.561767902427243e-06, + "loss": 0.3621, + "step": 6863, + "task_loss": 1.0713777542114258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4919794499874115, + "epoch": 5.8, + "learning_rate": 8.555729984301414e-06, + "loss": 0.425, + "step": 6864, + "task_loss": 0.9440869092941284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.11292891204357147, + "epoch": 5.8, + "learning_rate": 8.549692066175583e-06, + "loss": 0.366, + "step": 6865, + "task_loss": 0.010360435582697392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5620462894439697, + "epoch": 5.8, + "learning_rate": 8.543654148049753e-06, + "loss": 0.3998, + "step": 6866, + "task_loss": 0.759792685508728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29607874155044556, + "epoch": 5.8, + "learning_rate": 8.537616229923922e-06, + "loss": 0.5225, + "step": 6867, + "task_loss": 0.45389482378959656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5657116174697876, + "epoch": 5.81, + "learning_rate": 8.531578311798093e-06, + "loss": 0.4715, + "step": 6868, + "task_loss": 1.5670477151870728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4624946713447571, + "epoch": 5.81, + "learning_rate": 8.525540393672263e-06, + "loss": 0.49, + "step": 6869, + "task_loss": 0.7926468849182129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4558680057525635, + "epoch": 5.81, + "learning_rate": 8.519502475546432e-06, + "loss": 0.4301, + "step": 6870, + "task_loss": 0.7351441383361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3619242310523987, + "epoch": 5.81, + "learning_rate": 8.513464557420601e-06, + "loss": 0.4739, + "step": 6871, + "task_loss": 0.4874288737773895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34402012825012207, + "epoch": 5.81, + "learning_rate": 8.507426639294772e-06, + "loss": 0.4631, + "step": 6872, + "task_loss": 1.2328965663909912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4837105870246887, + "epoch": 5.81, + "learning_rate": 8.501388721168942e-06, + "loss": 0.4849, + "step": 6873, + "task_loss": 0.40857362747192383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31597450375556946, + "epoch": 5.81, + "learning_rate": 8.495350803043111e-06, + "loss": 0.317, + "step": 6874, + "task_loss": 0.38574692606925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31182849407196045, + "epoch": 5.81, + "learning_rate": 8.48931288491728e-06, + "loss": 0.4668, + "step": 6875, + "task_loss": 0.3975023627281189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5948579907417297, + "epoch": 5.81, + "learning_rate": 8.48327496679145e-06, + "loss": 0.4358, + "step": 6876, + "task_loss": 0.3158024251461029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3936498761177063, + "epoch": 5.81, + "learning_rate": 8.477237048665621e-06, + "loss": 0.3592, + "step": 6877, + "task_loss": 0.43681928515434265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42685794830322266, + "epoch": 5.81, + "learning_rate": 8.47119913053979e-06, + "loss": 0.4931, + "step": 6878, + "task_loss": 0.5872896313667297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4572516679763794, + "epoch": 5.81, + "learning_rate": 8.46516121241396e-06, + "loss": 0.4981, + "step": 6879, + "task_loss": 1.0090363025665283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39667075872421265, + "epoch": 5.82, + "learning_rate": 8.45912329428813e-06, + "loss": 0.4918, + "step": 6880, + "task_loss": 0.05588139221072197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3507067859172821, + "epoch": 5.82, + "learning_rate": 8.4530853761623e-06, + "loss": 0.4531, + "step": 6881, + "task_loss": 0.9461204409599304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41787266731262207, + "epoch": 5.82, + "learning_rate": 8.447047458036469e-06, + "loss": 0.4707, + "step": 6882, + "task_loss": 0.8120055794715881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3385689854621887, + "epoch": 5.82, + "learning_rate": 8.44100953991064e-06, + "loss": 0.3802, + "step": 6883, + "task_loss": 0.585078775882721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3481976389884949, + "epoch": 5.82, + "learning_rate": 8.43497162178481e-06, + "loss": 0.2832, + "step": 6884, + "task_loss": 0.6310349106788635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2829192876815796, + "epoch": 5.82, + "learning_rate": 8.428933703658979e-06, + "loss": 0.4389, + "step": 6885, + "task_loss": 0.2057802379131317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38993045687675476, + "epoch": 5.82, + "learning_rate": 8.422895785533148e-06, + "loss": 0.5308, + "step": 6886, + "task_loss": 0.7504974007606506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47812265157699585, + "epoch": 5.82, + "learning_rate": 8.416857867407318e-06, + "loss": 0.4516, + "step": 6887, + "task_loss": 0.9430537819862366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36125704646110535, + "epoch": 5.82, + "learning_rate": 8.410819949281489e-06, + "loss": 0.3086, + "step": 6888, + "task_loss": 0.40589481592178345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6667785048484802, + "epoch": 5.82, + "learning_rate": 8.40478203115566e-06, + "loss": 0.4718, + "step": 6889, + "task_loss": 0.5275201797485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6534362435340881, + "epoch": 5.82, + "learning_rate": 8.398744113029827e-06, + "loss": 0.5837, + "step": 6890, + "task_loss": 0.856420636177063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3661806881427765, + "epoch": 5.82, + "learning_rate": 8.392706194903997e-06, + "loss": 0.4569, + "step": 6891, + "task_loss": 0.4163728654384613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2615051865577698, + "epoch": 5.83, + "learning_rate": 8.386668276778168e-06, + "loss": 0.4235, + "step": 6892, + "task_loss": 0.01779782585799694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.307868629693985, + "epoch": 5.83, + "learning_rate": 8.380630358652338e-06, + "loss": 0.4214, + "step": 6893, + "task_loss": 0.17824146151542664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2924744486808777, + "epoch": 5.83, + "learning_rate": 8.374592440526507e-06, + "loss": 0.4645, + "step": 6894, + "task_loss": 1.3725001811981201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2637721300125122, + "epoch": 5.83, + "learning_rate": 8.368554522400676e-06, + "loss": 0.3707, + "step": 6895, + "task_loss": 0.7852872014045715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28147104382514954, + "epoch": 5.83, + "learning_rate": 8.362516604274847e-06, + "loss": 0.6144, + "step": 6896, + "task_loss": 0.68956059217453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3505222797393799, + "epoch": 5.83, + "learning_rate": 8.356478686149015e-06, + "loss": 0.5321, + "step": 6897, + "task_loss": 0.730493426322937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45047926902770996, + "epoch": 5.83, + "learning_rate": 8.350440768023186e-06, + "loss": 0.4641, + "step": 6898, + "task_loss": 0.8506103157997131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21070575714111328, + "epoch": 5.83, + "learning_rate": 8.344402849897357e-06, + "loss": 0.351, + "step": 6899, + "task_loss": 0.04980315640568733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6946448683738708, + "epoch": 5.83, + "learning_rate": 8.338364931771525e-06, + "loss": 0.4405, + "step": 6900, + "task_loss": 1.2568155527114868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28478628396987915, + "epoch": 5.83, + "learning_rate": 8.332327013645694e-06, + "loss": 0.5711, + "step": 6901, + "task_loss": 0.5820723176002502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5324742794036865, + "epoch": 5.83, + "learning_rate": 8.326289095519865e-06, + "loss": 0.4903, + "step": 6902, + "task_loss": 0.49629703164100647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34382620453834534, + "epoch": 5.83, + "learning_rate": 8.320251177394036e-06, + "loss": 0.4026, + "step": 6903, + "task_loss": 1.1378055810928345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34479331970214844, + "epoch": 5.84, + "learning_rate": 8.314213259268206e-06, + "loss": 0.3242, + "step": 6904, + "task_loss": 0.8123185634613037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22207149863243103, + "epoch": 5.84, + "learning_rate": 8.308175341142373e-06, + "loss": 0.373, + "step": 6905, + "task_loss": 0.20571757853031158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2800230085849762, + "epoch": 5.84, + "learning_rate": 8.302137423016544e-06, + "loss": 0.411, + "step": 6906, + "task_loss": 0.383259654045105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2858198583126068, + "epoch": 5.84, + "learning_rate": 8.296099504890714e-06, + "loss": 0.4757, + "step": 6907, + "task_loss": 0.047231972217559814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43270570039749146, + "epoch": 5.84, + "learning_rate": 8.290061586764885e-06, + "loss": 0.4351, + "step": 6908, + "task_loss": 0.49198442697525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3998495936393738, + "epoch": 5.84, + "learning_rate": 8.284023668639054e-06, + "loss": 0.4562, + "step": 6909, + "task_loss": 0.6213709712028503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4395737648010254, + "epoch": 5.84, + "learning_rate": 8.277985750513223e-06, + "loss": 0.4554, + "step": 6910, + "task_loss": 0.7981939911842346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39473816752433777, + "epoch": 5.84, + "learning_rate": 8.271947832387393e-06, + "loss": 0.4472, + "step": 6911, + "task_loss": 0.5105046629905701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6238789558410645, + "epoch": 5.84, + "learning_rate": 8.265909914261564e-06, + "loss": 0.613, + "step": 6912, + "task_loss": 1.0496739149093628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4039442837238312, + "epoch": 5.84, + "learning_rate": 8.259871996135733e-06, + "loss": 0.4631, + "step": 6913, + "task_loss": 1.0193341970443726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48370087146759033, + "epoch": 5.84, + "learning_rate": 8.253834078009903e-06, + "loss": 0.3788, + "step": 6914, + "task_loss": 0.46260449290275574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29194003343582153, + "epoch": 5.84, + "learning_rate": 8.247796159884072e-06, + "loss": 0.3284, + "step": 6915, + "task_loss": 0.6137953996658325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41501322388648987, + "epoch": 5.85, + "learning_rate": 8.241758241758243e-06, + "loss": 0.3267, + "step": 6916, + "task_loss": 0.3210653066635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3291185796260834, + "epoch": 5.85, + "learning_rate": 8.235720323632412e-06, + "loss": 0.3794, + "step": 6917, + "task_loss": 0.24007408320903778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30583199858665466, + "epoch": 5.85, + "learning_rate": 8.229682405506582e-06, + "loss": 0.4545, + "step": 6918, + "task_loss": 0.649651825428009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5069676637649536, + "epoch": 5.85, + "learning_rate": 8.223644487380751e-06, + "loss": 0.5291, + "step": 6919, + "task_loss": 0.5986087322235107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4521918296813965, + "epoch": 5.85, + "learning_rate": 8.217606569254922e-06, + "loss": 0.3647, + "step": 6920, + "task_loss": 1.0398566722869873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48589885234832764, + "epoch": 5.85, + "learning_rate": 8.21156865112909e-06, + "loss": 0.4215, + "step": 6921, + "task_loss": 0.5031166076660156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3149758577346802, + "epoch": 5.85, + "learning_rate": 8.205530733003261e-06, + "loss": 0.3585, + "step": 6922, + "task_loss": 0.7229981422424316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5669782161712646, + "epoch": 5.85, + "learning_rate": 8.199492814877432e-06, + "loss": 0.4234, + "step": 6923, + "task_loss": 0.703351616859436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3199085593223572, + "epoch": 5.85, + "learning_rate": 8.1934548967516e-06, + "loss": 0.3994, + "step": 6924, + "task_loss": 0.3237999677658081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4833778142929077, + "epoch": 5.85, + "learning_rate": 8.18741697862577e-06, + "loss": 0.6103, + "step": 6925, + "task_loss": 0.32273244857788086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31481650471687317, + "epoch": 5.85, + "learning_rate": 8.18137906049994e-06, + "loss": 0.4179, + "step": 6926, + "task_loss": 0.3099425435066223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6069097518920898, + "epoch": 5.85, + "learning_rate": 8.17534114237411e-06, + "loss": 0.3595, + "step": 6927, + "task_loss": 0.3710533678531647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44832298159599304, + "epoch": 5.86, + "learning_rate": 8.16930322424828e-06, + "loss": 0.4228, + "step": 6928, + "task_loss": 0.6668354868888855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3236726224422455, + "epoch": 5.86, + "learning_rate": 8.163265306122448e-06, + "loss": 0.4571, + "step": 6929, + "task_loss": 0.49051401019096375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4685375392436981, + "epoch": 5.86, + "learning_rate": 8.157227387996619e-06, + "loss": 0.4664, + "step": 6930, + "task_loss": 0.18978945910930634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5885454416275024, + "epoch": 5.86, + "learning_rate": 8.15118946987079e-06, + "loss": 0.5502, + "step": 6931, + "task_loss": 1.0149588584899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3576107621192932, + "epoch": 5.86, + "learning_rate": 8.145151551744958e-06, + "loss": 0.5249, + "step": 6932, + "task_loss": 1.1126909255981445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4122850298881531, + "epoch": 5.86, + "learning_rate": 8.139113633619129e-06, + "loss": 0.4226, + "step": 6933, + "task_loss": 0.3238452970981598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4157611131668091, + "epoch": 5.86, + "learning_rate": 8.133075715493298e-06, + "loss": 0.5139, + "step": 6934, + "task_loss": 0.25755226612091064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5746082067489624, + "epoch": 5.86, + "learning_rate": 8.127037797367468e-06, + "loss": 0.3502, + "step": 6935, + "task_loss": 1.0045945644378662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.681090772151947, + "epoch": 5.86, + "learning_rate": 8.120999879241637e-06, + "loss": 0.4626, + "step": 6936, + "task_loss": 0.7141767740249634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3222769498825073, + "epoch": 5.86, + "learning_rate": 8.114961961115808e-06, + "loss": 0.3979, + "step": 6937, + "task_loss": 0.4712897837162018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31626230478286743, + "epoch": 5.86, + "learning_rate": 8.108924042989978e-06, + "loss": 0.4332, + "step": 6938, + "task_loss": 0.0822918638586998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29413169622421265, + "epoch": 5.87, + "learning_rate": 8.102886124864147e-06, + "loss": 0.3332, + "step": 6939, + "task_loss": 0.5898066759109497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4244046211242676, + "epoch": 5.87, + "learning_rate": 8.096848206738316e-06, + "loss": 0.39, + "step": 6940, + "task_loss": 0.31856802105903625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3980801999568939, + "epoch": 5.87, + "learning_rate": 8.090810288612487e-06, + "loss": 0.4856, + "step": 6941, + "task_loss": 0.845194399356842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35983023047447205, + "epoch": 5.87, + "learning_rate": 8.084772370486657e-06, + "loss": 0.5219, + "step": 6942, + "task_loss": 0.12037420272827148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44201192259788513, + "epoch": 5.87, + "learning_rate": 8.078734452360826e-06, + "loss": 0.632, + "step": 6943, + "task_loss": 0.3167547285556793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3107828199863434, + "epoch": 5.87, + "learning_rate": 8.072696534234995e-06, + "loss": 0.4113, + "step": 6944, + "task_loss": 0.7686514854431152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2514476180076599, + "epoch": 5.87, + "learning_rate": 8.066658616109166e-06, + "loss": 0.4024, + "step": 6945, + "task_loss": 0.7811295390129089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22642725706100464, + "epoch": 5.87, + "learning_rate": 8.060620697983336e-06, + "loss": 0.4253, + "step": 6946, + "task_loss": 0.45459866523742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37283703684806824, + "epoch": 5.87, + "learning_rate": 8.054582779857505e-06, + "loss": 0.3389, + "step": 6947, + "task_loss": 0.5623932480812073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6369295716285706, + "epoch": 5.87, + "learning_rate": 8.048544861731676e-06, + "loss": 0.4503, + "step": 6948, + "task_loss": 1.0879145860671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22271110117435455, + "epoch": 5.87, + "learning_rate": 8.042506943605845e-06, + "loss": 0.3224, + "step": 6949, + "task_loss": 0.2093796730041504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4541492760181427, + "epoch": 5.87, + "learning_rate": 8.036469025480015e-06, + "loss": 0.4003, + "step": 6950, + "task_loss": 0.9150739908218384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6458548903465271, + "epoch": 5.88, + "learning_rate": 8.030431107354184e-06, + "loss": 0.5135, + "step": 6951, + "task_loss": 1.5363305807113647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3882969617843628, + "epoch": 5.88, + "learning_rate": 8.024393189228355e-06, + "loss": 0.5052, + "step": 6952, + "task_loss": 0.419509619474411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5319870710372925, + "epoch": 5.88, + "learning_rate": 8.018355271102525e-06, + "loss": 0.3907, + "step": 6953, + "task_loss": 1.2335803508758545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20668920874595642, + "epoch": 5.88, + "learning_rate": 8.012317352976694e-06, + "loss": 0.4689, + "step": 6954, + "task_loss": 0.02250358648598194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3774451017379761, + "epoch": 5.88, + "learning_rate": 8.006279434850863e-06, + "loss": 0.3553, + "step": 6955, + "task_loss": 0.6751906275749207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31259045004844666, + "epoch": 5.88, + "learning_rate": 8.000241516725033e-06, + "loss": 0.4625, + "step": 6956, + "task_loss": 0.5203465819358826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5578932166099548, + "epoch": 5.88, + "learning_rate": 7.994203598599204e-06, + "loss": 0.5615, + "step": 6957, + "task_loss": 0.7862531542778015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3468993902206421, + "epoch": 5.88, + "learning_rate": 7.988165680473373e-06, + "loss": 0.4295, + "step": 6958, + "task_loss": 0.5802791118621826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43521279096603394, + "epoch": 5.88, + "learning_rate": 7.982127762347542e-06, + "loss": 0.4418, + "step": 6959, + "task_loss": 0.6512595415115356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31035059690475464, + "epoch": 5.88, + "learning_rate": 7.976089844221712e-06, + "loss": 0.4464, + "step": 6960, + "task_loss": 1.3289785385131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5049430727958679, + "epoch": 5.88, + "learning_rate": 7.970051926095883e-06, + "loss": 0.4104, + "step": 6961, + "task_loss": 0.5964776873588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31972306966781616, + "epoch": 5.88, + "learning_rate": 7.964014007970052e-06, + "loss": 0.4856, + "step": 6962, + "task_loss": 0.7360772490501404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3082845211029053, + "epoch": 5.89, + "learning_rate": 7.957976089844222e-06, + "loss": 0.4448, + "step": 6963, + "task_loss": 0.47658249735832214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1652650535106659, + "epoch": 5.89, + "learning_rate": 7.951938171718391e-06, + "loss": 0.4013, + "step": 6964, + "task_loss": 0.12744301557540894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.384038507938385, + "epoch": 5.89, + "learning_rate": 7.945900253592562e-06, + "loss": 0.5222, + "step": 6965, + "task_loss": 1.5211560726165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2479127049446106, + "epoch": 5.89, + "learning_rate": 7.93986233546673e-06, + "loss": 0.451, + "step": 6966, + "task_loss": 0.10588888823986053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43977290391921997, + "epoch": 5.89, + "learning_rate": 7.933824417340901e-06, + "loss": 0.4231, + "step": 6967, + "task_loss": 0.7373864650726318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22163361310958862, + "epoch": 5.89, + "learning_rate": 7.927786499215072e-06, + "loss": 0.4501, + "step": 6968, + "task_loss": 0.41722995042800903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4625582695007324, + "epoch": 5.89, + "learning_rate": 7.92174858108924e-06, + "loss": 0.4491, + "step": 6969, + "task_loss": 1.1094281673431396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15898041427135468, + "epoch": 5.89, + "learning_rate": 7.91571066296341e-06, + "loss": 0.2682, + "step": 6970, + "task_loss": 0.13250593841075897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.452730655670166, + "epoch": 5.89, + "learning_rate": 7.90967274483758e-06, + "loss": 0.4133, + "step": 6971, + "task_loss": 0.25173208117485046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4305339753627777, + "epoch": 5.89, + "learning_rate": 7.90363482671175e-06, + "loss": 0.4637, + "step": 6972, + "task_loss": 1.0811203718185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7033300399780273, + "epoch": 5.89, + "learning_rate": 7.897596908585921e-06, + "loss": 0.5198, + "step": 6973, + "task_loss": 0.6781212687492371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6664816737174988, + "epoch": 5.89, + "learning_rate": 7.891558990460088e-06, + "loss": 0.5443, + "step": 6974, + "task_loss": 1.5009799003601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46847623586654663, + "epoch": 5.9, + "learning_rate": 7.885521072334259e-06, + "loss": 0.4898, + "step": 6975, + "task_loss": 0.7659034132957458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4427796006202698, + "epoch": 5.9, + "learning_rate": 7.87948315420843e-06, + "loss": 0.5193, + "step": 6976, + "task_loss": 0.6218733191490173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4531594514846802, + "epoch": 5.9, + "learning_rate": 7.8734452360826e-06, + "loss": 0.6058, + "step": 6977, + "task_loss": 0.8583329319953918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39716672897338867, + "epoch": 5.9, + "learning_rate": 7.867407317956769e-06, + "loss": 0.5949, + "step": 6978, + "task_loss": 0.5925692319869995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20717518031597137, + "epoch": 5.9, + "learning_rate": 7.861369399830938e-06, + "loss": 0.4199, + "step": 6979, + "task_loss": 0.21700134873390198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3442147970199585, + "epoch": 5.9, + "learning_rate": 7.855331481705108e-06, + "loss": 0.396, + "step": 6980, + "task_loss": 0.5193817019462585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39888426661491394, + "epoch": 5.9, + "learning_rate": 7.849293563579279e-06, + "loss": 0.4422, + "step": 6981, + "task_loss": 0.5411069989204407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24404998123645782, + "epoch": 5.9, + "learning_rate": 7.843255645453448e-06, + "loss": 0.3713, + "step": 6982, + "task_loss": 0.2543105185031891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29963618516921997, + "epoch": 5.9, + "learning_rate": 7.837217727327619e-06, + "loss": 0.5896, + "step": 6983, + "task_loss": 0.38067957758903503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37129101157188416, + "epoch": 5.9, + "learning_rate": 7.831179809201787e-06, + "loss": 0.3956, + "step": 6984, + "task_loss": 0.7263840436935425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4122962951660156, + "epoch": 5.9, + "learning_rate": 7.825141891075958e-06, + "loss": 0.4524, + "step": 6985, + "task_loss": 0.2442743182182312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5730170011520386, + "epoch": 5.9, + "learning_rate": 7.819103972950127e-06, + "loss": 0.3797, + "step": 6986, + "task_loss": 0.5021531581878662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.377128005027771, + "epoch": 5.91, + "learning_rate": 7.813066054824297e-06, + "loss": 0.4749, + "step": 6987, + "task_loss": 0.20708145201206207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.401876837015152, + "epoch": 5.91, + "learning_rate": 7.807028136698468e-06, + "loss": 0.4051, + "step": 6988, + "task_loss": 1.0245081186294556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3383440375328064, + "epoch": 5.91, + "learning_rate": 7.800990218572637e-06, + "loss": 0.3659, + "step": 6989, + "task_loss": 0.5507072806358337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5728752613067627, + "epoch": 5.91, + "learning_rate": 7.794952300446806e-06, + "loss": 0.4041, + "step": 6990, + "task_loss": 1.4205158948898315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3251244127750397, + "epoch": 5.91, + "learning_rate": 7.788914382320976e-06, + "loss": 0.3733, + "step": 6991, + "task_loss": 0.8480383157730103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6370258331298828, + "epoch": 5.91, + "learning_rate": 7.782876464195147e-06, + "loss": 0.4515, + "step": 6992, + "task_loss": 0.43635597825050354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4869873523712158, + "epoch": 5.91, + "learning_rate": 7.776838546069316e-06, + "loss": 0.4822, + "step": 6993, + "task_loss": 0.5614396333694458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47066739201545715, + "epoch": 5.91, + "learning_rate": 7.770800627943485e-06, + "loss": 0.4863, + "step": 6994, + "task_loss": 1.292590856552124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5118110179901123, + "epoch": 5.91, + "learning_rate": 7.764762709817655e-06, + "loss": 0.5562, + "step": 6995, + "task_loss": 1.192577600479126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27334535121917725, + "epoch": 5.91, + "learning_rate": 7.758724791691826e-06, + "loss": 0.3563, + "step": 6996, + "task_loss": 0.6750822067260742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3604450821876526, + "epoch": 5.91, + "learning_rate": 7.752686873565995e-06, + "loss": 0.4196, + "step": 6997, + "task_loss": 0.9979363679885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.485706627368927, + "epoch": 5.91, + "learning_rate": 7.746648955440165e-06, + "loss": 0.5278, + "step": 6998, + "task_loss": 0.9236827492713928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5276333093643188, + "epoch": 5.92, + "learning_rate": 7.740611037314334e-06, + "loss": 0.5335, + "step": 6999, + "task_loss": 0.499744176864624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3345971703529358, + "epoch": 5.92, + "learning_rate": 7.734573119188505e-06, + "loss": 0.414, + "step": 7000, + "task_loss": 0.4801103174686432 + }, + { + "epoch": 5.92, + "eval_accuracy": 0.9103762376237624, + "eval_loss": 0.2805521488189697, + "eval_runtime": 227.887, + "eval_samples_per_second": 110.801, + "eval_steps_per_second": 0.869, + "step": 7000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37046775221824646, + "epoch": 5.92, + "learning_rate": 7.728535201062674e-06, + "loss": 0.3694, + "step": 7001, + "task_loss": 1.0790047645568848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43704119324684143, + "epoch": 5.92, + "learning_rate": 7.722497282936844e-06, + "loss": 0.498, + "step": 7002, + "task_loss": 0.2983787953853607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19424527883529663, + "epoch": 5.92, + "learning_rate": 7.716459364811015e-06, + "loss": 0.3969, + "step": 7003, + "task_loss": 1.0791457891464233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.580791175365448, + "epoch": 5.92, + "learning_rate": 7.710421446685184e-06, + "loss": 0.4919, + "step": 7004, + "task_loss": 0.42992478609085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3083846867084503, + "epoch": 5.92, + "learning_rate": 7.704383528559352e-06, + "loss": 0.4861, + "step": 7005, + "task_loss": 0.05863397940993309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20193368196487427, + "epoch": 5.92, + "learning_rate": 7.698345610433523e-06, + "loss": 0.4803, + "step": 7006, + "task_loss": 0.04005083814263344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5419428944587708, + "epoch": 5.92, + "learning_rate": 7.692307692307694e-06, + "loss": 0.4819, + "step": 7007, + "task_loss": 0.8839797973632812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49520108103752136, + "epoch": 5.92, + "learning_rate": 7.686269774181862e-06, + "loss": 0.5003, + "step": 7008, + "task_loss": 0.9040836691856384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24556726217269897, + "epoch": 5.92, + "learning_rate": 7.680231856056031e-06, + "loss": 0.3732, + "step": 7009, + "task_loss": 0.4811991751194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7885105609893799, + "epoch": 5.93, + "learning_rate": 7.674193937930202e-06, + "loss": 0.435, + "step": 7010, + "task_loss": 1.1441720724105835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32612931728363037, + "epoch": 5.93, + "learning_rate": 7.668156019804372e-06, + "loss": 0.3565, + "step": 7011, + "task_loss": 0.26004287600517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3905634880065918, + "epoch": 5.93, + "learning_rate": 7.662118101678541e-06, + "loss": 0.4717, + "step": 7012, + "task_loss": 0.5330042243003845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44006723165512085, + "epoch": 5.93, + "learning_rate": 7.65608018355271e-06, + "loss": 0.4301, + "step": 7013, + "task_loss": 0.8790510892868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5632410049438477, + "epoch": 5.93, + "learning_rate": 7.65004226542688e-06, + "loss": 0.4422, + "step": 7014, + "task_loss": 0.7673523426055908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4600031077861786, + "epoch": 5.93, + "learning_rate": 7.644004347301051e-06, + "loss": 0.602, + "step": 7015, + "task_loss": 0.34858885407447815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3741466999053955, + "epoch": 5.93, + "learning_rate": 7.63796642917522e-06, + "loss": 0.5048, + "step": 7016, + "task_loss": 0.9648093581199646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5596623420715332, + "epoch": 5.93, + "learning_rate": 7.63192851104939e-06, + "loss": 0.4372, + "step": 7017, + "task_loss": 0.593818187713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8328822255134583, + "epoch": 5.93, + "learning_rate": 7.62589059292356e-06, + "loss": 0.6445, + "step": 7018, + "task_loss": 0.7616930603981018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21878403425216675, + "epoch": 5.93, + "learning_rate": 7.6198526747977294e-06, + "loss": 0.3451, + "step": 7019, + "task_loss": 0.7419448494911194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3129991292953491, + "epoch": 5.93, + "learning_rate": 7.6138147566719e-06, + "loss": 0.3389, + "step": 7020, + "task_loss": 0.3373315930366516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42461833357810974, + "epoch": 5.93, + "learning_rate": 7.60777683854607e-06, + "loss": 0.3885, + "step": 7021, + "task_loss": 0.3851896822452545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3396170735359192, + "epoch": 5.94, + "learning_rate": 7.60173892042024e-06, + "loss": 0.4242, + "step": 7022, + "task_loss": 0.6543120741844177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3324466943740845, + "epoch": 5.94, + "learning_rate": 7.595701002294408e-06, + "loss": 0.4188, + "step": 7023, + "task_loss": 0.6630089282989502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37620246410369873, + "epoch": 5.94, + "learning_rate": 7.589663084168579e-06, + "loss": 0.3767, + "step": 7024, + "task_loss": 0.5533791780471802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4594841003417969, + "epoch": 5.94, + "learning_rate": 7.583625166042749e-06, + "loss": 0.4321, + "step": 7025, + "task_loss": 0.19519706070423126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3487852215766907, + "epoch": 5.94, + "learning_rate": 7.577587247916919e-06, + "loss": 0.5114, + "step": 7026, + "task_loss": 0.11900435388088226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5019280910491943, + "epoch": 5.94, + "learning_rate": 7.571549329791089e-06, + "loss": 0.3939, + "step": 7027, + "task_loss": 1.2152637243270874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4043170213699341, + "epoch": 5.94, + "learning_rate": 7.565511411665258e-06, + "loss": 0.5566, + "step": 7028, + "task_loss": 0.7055994272232056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40806329250335693, + "epoch": 5.94, + "learning_rate": 7.5594734935394275e-06, + "loss": 0.3817, + "step": 7029, + "task_loss": 0.844902753829956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.302229642868042, + "epoch": 5.94, + "learning_rate": 7.553435575413598e-06, + "loss": 0.4131, + "step": 7030, + "task_loss": 0.5857746005058289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38478633761405945, + "epoch": 5.94, + "learning_rate": 7.547397657287768e-06, + "loss": 0.5197, + "step": 7031, + "task_loss": 0.8376520276069641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5111440420150757, + "epoch": 5.94, + "learning_rate": 7.5413597391619375e-06, + "loss": 0.4412, + "step": 7032, + "task_loss": 1.3450348377227783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39812418818473816, + "epoch": 5.94, + "learning_rate": 7.535321821036106e-06, + "loss": 0.4569, + "step": 7033, + "task_loss": 0.999010443687439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3072189688682556, + "epoch": 5.95, + "learning_rate": 7.529283902910277e-06, + "loss": 0.388, + "step": 7034, + "task_loss": 0.7049151659011841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4906582236289978, + "epoch": 5.95, + "learning_rate": 7.523245984784447e-06, + "loss": 0.3986, + "step": 7035, + "task_loss": 0.19671744108200073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47010916471481323, + "epoch": 5.95, + "learning_rate": 7.5172080666586164e-06, + "loss": 0.531, + "step": 7036, + "task_loss": 0.5344096422195435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4831275939941406, + "epoch": 5.95, + "learning_rate": 7.511170148532787e-06, + "loss": 0.5177, + "step": 7037, + "task_loss": 0.6647403836250305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3792143166065216, + "epoch": 5.95, + "learning_rate": 7.505132230406955e-06, + "loss": 0.3463, + "step": 7038, + "task_loss": 0.8794869780540466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27047115564346313, + "epoch": 5.95, + "learning_rate": 7.499094312281126e-06, + "loss": 0.4287, + "step": 7039, + "task_loss": 1.5826219320297241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4534704089164734, + "epoch": 5.95, + "learning_rate": 7.493056394155295e-06, + "loss": 0.4796, + "step": 7040, + "task_loss": 0.3871309757232666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6455046534538269, + "epoch": 5.95, + "learning_rate": 7.487018476029466e-06, + "loss": 0.4326, + "step": 7041, + "task_loss": 0.22206416726112366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49456316232681274, + "epoch": 5.95, + "learning_rate": 7.480980557903636e-06, + "loss": 0.4727, + "step": 7042, + "task_loss": 0.35555681586265564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5057169198989868, + "epoch": 5.95, + "learning_rate": 7.4749426397778045e-06, + "loss": 0.4278, + "step": 7043, + "task_loss": 1.2687804698944092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3873075842857361, + "epoch": 5.95, + "learning_rate": 7.468904721651974e-06, + "loss": 0.4967, + "step": 7044, + "task_loss": 0.7278705835342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21768298745155334, + "epoch": 5.95, + "learning_rate": 7.462866803526145e-06, + "loss": 0.3061, + "step": 7045, + "task_loss": 0.6389554142951965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27365902066230774, + "epoch": 5.96, + "learning_rate": 7.4568288854003145e-06, + "loss": 0.3973, + "step": 7046, + "task_loss": 0.07726956903934479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5594854354858398, + "epoch": 5.96, + "learning_rate": 7.450790967274484e-06, + "loss": 0.4927, + "step": 7047, + "task_loss": 0.8201348781585693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41983649134635925, + "epoch": 5.96, + "learning_rate": 7.444753049148653e-06, + "loss": 0.3994, + "step": 7048, + "task_loss": 0.9367262721061707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4170165956020355, + "epoch": 5.96, + "learning_rate": 7.438715131022824e-06, + "loss": 0.4594, + "step": 7049, + "task_loss": 0.6192636489868164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42109233140945435, + "epoch": 5.96, + "learning_rate": 7.432677212896993e-06, + "loss": 0.3542, + "step": 7050, + "task_loss": 1.2949645519256592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5134333968162537, + "epoch": 5.96, + "learning_rate": 7.426639294771163e-06, + "loss": 0.4059, + "step": 7051, + "task_loss": 0.2380795180797577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.507244884967804, + "epoch": 5.96, + "learning_rate": 7.420601376645334e-06, + "loss": 0.4131, + "step": 7052, + "task_loss": 0.832607626914978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5876025557518005, + "epoch": 5.96, + "learning_rate": 7.414563458519503e-06, + "loss": 0.4915, + "step": 7053, + "task_loss": 0.5305912494659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44183656573295593, + "epoch": 5.96, + "learning_rate": 7.408525540393672e-06, + "loss": 0.3622, + "step": 7054, + "task_loss": 1.790590524673462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4473643898963928, + "epoch": 5.96, + "learning_rate": 7.402487622267842e-06, + "loss": 0.5364, + "step": 7055, + "task_loss": 0.7588744759559631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2649776339530945, + "epoch": 5.96, + "learning_rate": 7.396449704142013e-06, + "loss": 0.439, + "step": 7056, + "task_loss": 0.284032940864563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29009222984313965, + "epoch": 5.96, + "learning_rate": 7.390411786016182e-06, + "loss": 0.4439, + "step": 7057, + "task_loss": 0.7101868391036987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30446457862854004, + "epoch": 5.97, + "learning_rate": 7.384373867890351e-06, + "loss": 0.412, + "step": 7058, + "task_loss": 0.5937415361404419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5244344472885132, + "epoch": 5.97, + "learning_rate": 7.378335949764521e-06, + "loss": 0.4599, + "step": 7059, + "task_loss": 0.7775442004203796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24270044267177582, + "epoch": 5.97, + "learning_rate": 7.3722980316386915e-06, + "loss": 0.3707, + "step": 7060, + "task_loss": 0.8214513063430786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.670659065246582, + "epoch": 5.97, + "learning_rate": 7.366260113512861e-06, + "loss": 0.5485, + "step": 7061, + "task_loss": 1.0849199295043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4346197247505188, + "epoch": 5.97, + "learning_rate": 7.360222195387032e-06, + "loss": 0.6423, + "step": 7062, + "task_loss": 0.3534286320209503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2518426179885864, + "epoch": 5.97, + "learning_rate": 7.3541842772612e-06, + "loss": 0.3991, + "step": 7063, + "task_loss": 0.11680903285741806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28752532601356506, + "epoch": 5.97, + "learning_rate": 7.34814635913537e-06, + "loss": 0.3104, + "step": 7064, + "task_loss": 0.3754250407218933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4494609236717224, + "epoch": 5.97, + "learning_rate": 7.34210844100954e-06, + "loss": 0.6208, + "step": 7065, + "task_loss": 0.899495005607605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34619009494781494, + "epoch": 5.97, + "learning_rate": 7.336070522883711e-06, + "loss": 0.3983, + "step": 7066, + "task_loss": 0.23274581134319305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5593941807746887, + "epoch": 5.97, + "learning_rate": 7.33003260475788e-06, + "loss": 0.4632, + "step": 7067, + "task_loss": 0.7184323072433472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5535835027694702, + "epoch": 5.97, + "learning_rate": 7.323994686632049e-06, + "loss": 0.3368, + "step": 7068, + "task_loss": 0.7134178876876831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5950720906257629, + "epoch": 5.97, + "learning_rate": 7.317956768506219e-06, + "loss": 0.4394, + "step": 7069, + "task_loss": 1.1516510248184204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3190397024154663, + "epoch": 5.98, + "learning_rate": 7.31191885038039e-06, + "loss": 0.411, + "step": 7070, + "task_loss": 0.2600443363189697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5478578209877014, + "epoch": 5.98, + "learning_rate": 7.305880932254559e-06, + "loss": 0.4204, + "step": 7071, + "task_loss": 1.0017175674438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.334171324968338, + "epoch": 5.98, + "learning_rate": 7.299843014128729e-06, + "loss": 0.3976, + "step": 7072, + "task_loss": 0.9251954555511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3606780171394348, + "epoch": 5.98, + "learning_rate": 7.293805096002898e-06, + "loss": 0.5004, + "step": 7073, + "task_loss": 0.49163466691970825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28308412432670593, + "epoch": 5.98, + "learning_rate": 7.2877671778770685e-06, + "loss": 0.3652, + "step": 7074, + "task_loss": 0.28958550095558167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36416882276535034, + "epoch": 5.98, + "learning_rate": 7.281729259751238e-06, + "loss": 0.4951, + "step": 7075, + "task_loss": 0.8822728991508484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4413394629955292, + "epoch": 5.98, + "learning_rate": 7.275691341625408e-06, + "loss": 0.3357, + "step": 7076, + "task_loss": 0.5711826086044312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27178695797920227, + "epoch": 5.98, + "learning_rate": 7.2696534234995785e-06, + "loss": 0.4049, + "step": 7077, + "task_loss": 0.4255422353744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2500690817832947, + "epoch": 5.98, + "learning_rate": 7.263615505373747e-06, + "loss": 0.3383, + "step": 7078, + "task_loss": 0.180959552526474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3031746745109558, + "epoch": 5.98, + "learning_rate": 7.257577587247917e-06, + "loss": 0.3904, + "step": 7079, + "task_loss": 0.83329176902771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3383169174194336, + "epoch": 5.98, + "learning_rate": 7.251539669122087e-06, + "loss": 0.489, + "step": 7080, + "task_loss": 0.6522268652915955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5693873167037964, + "epoch": 5.99, + "learning_rate": 7.245501750996257e-06, + "loss": 0.3995, + "step": 7081, + "task_loss": 0.6012743711471558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39242082834243774, + "epoch": 5.99, + "learning_rate": 7.239463832870427e-06, + "loss": 0.2985, + "step": 7082, + "task_loss": 0.6262942552566528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5861955881118774, + "epoch": 5.99, + "learning_rate": 7.233425914744596e-06, + "loss": 0.4791, + "step": 7083, + "task_loss": 0.7192972302436829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6567158699035645, + "epoch": 5.99, + "learning_rate": 7.227387996618766e-06, + "loss": 0.528, + "step": 7084, + "task_loss": 0.5775797367095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22389966249465942, + "epoch": 5.99, + "learning_rate": 7.221350078492936e-06, + "loss": 0.3791, + "step": 7085, + "task_loss": 0.9731560945510864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2128923386335373, + "epoch": 5.99, + "learning_rate": 7.215312160367106e-06, + "loss": 0.3307, + "step": 7086, + "task_loss": 0.7405011653900146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6503626704216003, + "epoch": 5.99, + "learning_rate": 7.209274242241277e-06, + "loss": 0.4312, + "step": 7087, + "task_loss": 0.7636539936065674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.11323842406272888, + "epoch": 5.99, + "learning_rate": 7.203236324115445e-06, + "loss": 0.3105, + "step": 7088, + "task_loss": 0.2489795982837677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8654056787490845, + "epoch": 5.99, + "learning_rate": 7.197198405989615e-06, + "loss": 0.6491, + "step": 7089, + "task_loss": 1.5682518482208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35108596086502075, + "epoch": 5.99, + "learning_rate": 7.191160487863785e-06, + "loss": 0.3238, + "step": 7090, + "task_loss": 0.4787486791610718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45431530475616455, + "epoch": 5.99, + "learning_rate": 7.1851225697379555e-06, + "loss": 0.4341, + "step": 7091, + "task_loss": 0.7081970572471619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5746167898178101, + "epoch": 5.99, + "learning_rate": 7.1790846516121235e-06, + "loss": 0.5528, + "step": 7092, + "task_loss": 1.1433279514312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4582858383655548, + "epoch": 6.0, + "learning_rate": 7.173046733486294e-06, + "loss": 0.4393, + "step": 7093, + "task_loss": 0.3183719217777252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.510151207447052, + "epoch": 6.0, + "learning_rate": 7.167008815360464e-06, + "loss": 0.4725, + "step": 7094, + "task_loss": 0.8925796747207642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2868543863296509, + "epoch": 6.0, + "learning_rate": 7.160970897234634e-06, + "loss": 0.4292, + "step": 7095, + "task_loss": 0.2884525656700134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28785791993141174, + "epoch": 6.0, + "learning_rate": 7.154932979108804e-06, + "loss": 0.4467, + "step": 7096, + "task_loss": 0.7509374618530273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4452590346336365, + "epoch": 6.0, + "learning_rate": 7.148895060982973e-06, + "loss": 0.511, + "step": 7097, + "task_loss": 0.7346837520599365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33730486035346985, + "epoch": 6.0, + "learning_rate": 7.142857142857143e-06, + "loss": 0.517, + "step": 7098, + "task_loss": 0.965241014957428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5262080430984497, + "epoch": 6.0, + "learning_rate": 7.136819224731313e-06, + "loss": 0.9397, + "step": 7099, + "task_loss": 0.3902360796928406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5188996195793152, + "epoch": 6.0, + "learning_rate": 7.130781306605483e-06, + "loss": 0.4016, + "step": 7100, + "task_loss": 0.5250151753425598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36875835061073303, + "epoch": 6.0, + "learning_rate": 7.124743388479653e-06, + "loss": 0.4149, + "step": 7101, + "task_loss": 0.4496521055698395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.251220703125, + "epoch": 6.0, + "learning_rate": 7.118705470353822e-06, + "loss": 0.4304, + "step": 7102, + "task_loss": 0.43171605467796326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2986379861831665, + "epoch": 6.0, + "learning_rate": 7.112667552227991e-06, + "loss": 0.433, + "step": 7103, + "task_loss": 0.47165364027023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45028892159461975, + "epoch": 6.01, + "learning_rate": 7.106629634102162e-06, + "loss": 0.5214, + "step": 7104, + "task_loss": 0.8099179267883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38340699672698975, + "epoch": 6.01, + "learning_rate": 7.100591715976332e-06, + "loss": 0.4698, + "step": 7105, + "task_loss": 0.3486477732658386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3130529522895813, + "epoch": 6.01, + "learning_rate": 7.094553797850502e-06, + "loss": 0.3593, + "step": 7106, + "task_loss": 0.2623346745967865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7038084864616394, + "epoch": 6.01, + "learning_rate": 7.08851587972467e-06, + "loss": 0.4758, + "step": 7107, + "task_loss": 0.6391435861587524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46086493134498596, + "epoch": 6.01, + "learning_rate": 7.082477961598841e-06, + "loss": 0.4552, + "step": 7108, + "task_loss": 0.647635817527771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5871177911758423, + "epoch": 6.01, + "learning_rate": 7.0764400434730105e-06, + "loss": 0.4943, + "step": 7109, + "task_loss": 1.8093488216400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8817994594573975, + "epoch": 6.01, + "learning_rate": 7.070402125347181e-06, + "loss": 0.5513, + "step": 7110, + "task_loss": 0.44257140159606934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6310312747955322, + "epoch": 6.01, + "learning_rate": 7.064364207221351e-06, + "loss": 0.6963, + "step": 7111, + "task_loss": 0.5926650166511536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3316280245780945, + "epoch": 6.01, + "learning_rate": 7.05832628909552e-06, + "loss": 0.3539, + "step": 7112, + "task_loss": 0.21706748008728027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5676208734512329, + "epoch": 6.01, + "learning_rate": 7.0522883709696894e-06, + "loss": 0.4231, + "step": 7113, + "task_loss": 1.0569360256195068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4290180504322052, + "epoch": 6.01, + "learning_rate": 7.04625045284386e-06, + "loss": 0.477, + "step": 7114, + "task_loss": 0.3144263029098511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15094241499900818, + "epoch": 6.01, + "learning_rate": 7.04021253471803e-06, + "loss": 0.2928, + "step": 7115, + "task_loss": 0.0428164042532444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6265444755554199, + "epoch": 6.02, + "learning_rate": 7.0341746165921994e-06, + "loss": 0.5195, + "step": 7116, + "task_loss": 0.41608041524887085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34241437911987305, + "epoch": 6.02, + "learning_rate": 7.028136698466368e-06, + "loss": 0.339, + "step": 7117, + "task_loss": 0.7177985310554504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4421979486942291, + "epoch": 6.02, + "learning_rate": 7.022098780340539e-06, + "loss": 0.3941, + "step": 7118, + "task_loss": 0.18723012506961823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32216769456863403, + "epoch": 6.02, + "learning_rate": 7.016060862214709e-06, + "loss": 0.3952, + "step": 7119, + "task_loss": 0.47555041313171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3246784210205078, + "epoch": 6.02, + "learning_rate": 7.010022944088878e-06, + "loss": 0.4045, + "step": 7120, + "task_loss": 0.6870850324630737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7007710337638855, + "epoch": 6.02, + "learning_rate": 7.003985025963049e-06, + "loss": 0.4158, + "step": 7121, + "task_loss": 0.7366905808448792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5591539740562439, + "epoch": 6.02, + "learning_rate": 6.997947107837218e-06, + "loss": 0.3927, + "step": 7122, + "task_loss": 1.010343074798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.434347927570343, + "epoch": 6.02, + "learning_rate": 6.9919091897113875e-06, + "loss": 0.4522, + "step": 7123, + "task_loss": 0.6980617642402649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32159489393234253, + "epoch": 6.02, + "learning_rate": 6.985871271585557e-06, + "loss": 0.4841, + "step": 7124, + "task_loss": 0.5612009763717651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.376287579536438, + "epoch": 6.02, + "learning_rate": 6.979833353459728e-06, + "loss": 0.4228, + "step": 7125, + "task_loss": 0.8765881657600403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3521145284175873, + "epoch": 6.02, + "learning_rate": 6.9737954353338975e-06, + "loss": 0.3515, + "step": 7126, + "task_loss": 1.162037968635559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3611052334308624, + "epoch": 6.02, + "learning_rate": 6.967757517208066e-06, + "loss": 0.3082, + "step": 7127, + "task_loss": 0.404824823141098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6239032745361328, + "epoch": 6.03, + "learning_rate": 6.961719599082236e-06, + "loss": 0.5001, + "step": 7128, + "task_loss": 0.5826499462127686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33717507123947144, + "epoch": 6.03, + "learning_rate": 6.955681680956407e-06, + "loss": 0.4948, + "step": 7129, + "task_loss": 0.45620113611221313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.223327174782753, + "epoch": 6.03, + "learning_rate": 6.9496437628305764e-06, + "loss": 0.4137, + "step": 7130, + "task_loss": 0.4199458658695221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2688449025154114, + "epoch": 6.03, + "learning_rate": 6.943605844704747e-06, + "loss": 0.5287, + "step": 7131, + "task_loss": 0.1741996556520462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3611210286617279, + "epoch": 6.03, + "learning_rate": 6.937567926578915e-06, + "loss": 0.2833, + "step": 7132, + "task_loss": 0.4573085904121399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18000757694244385, + "epoch": 6.03, + "learning_rate": 6.931530008453086e-06, + "loss": 0.4284, + "step": 7133, + "task_loss": 0.4397300183773041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3811725974082947, + "epoch": 6.03, + "learning_rate": 6.925492090327255e-06, + "loss": 0.4054, + "step": 7134, + "task_loss": 0.34527382254600525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37422382831573486, + "epoch": 6.03, + "learning_rate": 6.919454172201426e-06, + "loss": 0.4071, + "step": 7135, + "task_loss": 0.5344712734222412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48744499683380127, + "epoch": 6.03, + "learning_rate": 6.913416254075596e-06, + "loss": 0.3935, + "step": 7136, + "task_loss": 0.44829416275024414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4132014811038971, + "epoch": 6.03, + "learning_rate": 6.9073783359497645e-06, + "loss": 0.4354, + "step": 7137, + "task_loss": 0.9007019996643066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5106713175773621, + "epoch": 6.03, + "learning_rate": 6.901340417823934e-06, + "loss": 0.4397, + "step": 7138, + "task_loss": 0.5553090572357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3593030273914337, + "epoch": 6.03, + "learning_rate": 6.895302499698105e-06, + "loss": 0.3602, + "step": 7139, + "task_loss": 0.6285237073898315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4235730767250061, + "epoch": 6.04, + "learning_rate": 6.8892645815722745e-06, + "loss": 0.4447, + "step": 7140, + "task_loss": 0.4478953182697296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30977529287338257, + "epoch": 6.04, + "learning_rate": 6.883226663446444e-06, + "loss": 0.4706, + "step": 7141, + "task_loss": 0.2656865119934082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3857533633708954, + "epoch": 6.04, + "learning_rate": 6.877188745320613e-06, + "loss": 0.5095, + "step": 7142, + "task_loss": 0.420479953289032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4781047999858856, + "epoch": 6.04, + "learning_rate": 6.871150827194784e-06, + "loss": 0.3935, + "step": 7143, + "task_loss": 0.5897602438926697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48093336820602417, + "epoch": 6.04, + "learning_rate": 6.865112909068953e-06, + "loss": 0.4192, + "step": 7144, + "task_loss": 0.6875292062759399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5217198133468628, + "epoch": 6.04, + "learning_rate": 6.859074990943123e-06, + "loss": 0.4914, + "step": 7145, + "task_loss": 1.0731172561645508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49561771750450134, + "epoch": 6.04, + "learning_rate": 6.853037072817294e-06, + "loss": 0.5129, + "step": 7146, + "task_loss": 0.7291666865348816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4369674324989319, + "epoch": 6.04, + "learning_rate": 6.846999154691463e-06, + "loss": 0.4054, + "step": 7147, + "task_loss": 0.9252296090126038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29612022638320923, + "epoch": 6.04, + "learning_rate": 6.840961236565632e-06, + "loss": 0.3802, + "step": 7148, + "task_loss": 0.43354177474975586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4211171269416809, + "epoch": 6.04, + "learning_rate": 6.834923318439802e-06, + "loss": 0.4061, + "step": 7149, + "task_loss": 0.8335044980049133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35648655891418457, + "epoch": 6.04, + "learning_rate": 6.828885400313973e-06, + "loss": 0.4281, + "step": 7150, + "task_loss": 0.5089904069900513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3265579640865326, + "epoch": 6.04, + "learning_rate": 6.822847482188142e-06, + "loss": 0.3215, + "step": 7151, + "task_loss": 0.3704642057418823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5553097128868103, + "epoch": 6.05, + "learning_rate": 6.816809564062311e-06, + "loss": 0.4735, + "step": 7152, + "task_loss": 1.075695514678955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2504265308380127, + "epoch": 6.05, + "learning_rate": 6.810771645936481e-06, + "loss": 0.3984, + "step": 7153, + "task_loss": 0.03829047828912735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29096919298171997, + "epoch": 6.05, + "learning_rate": 6.8047337278106515e-06, + "loss": 0.4162, + "step": 7154, + "task_loss": 0.144149512052536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.734687328338623, + "epoch": 6.05, + "learning_rate": 6.798695809684821e-06, + "loss": 0.5563, + "step": 7155, + "task_loss": 0.2257729321718216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6400864720344543, + "epoch": 6.05, + "learning_rate": 6.792657891558992e-06, + "loss": 0.6251, + "step": 7156, + "task_loss": 0.8936904668807983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3554542660713196, + "epoch": 6.05, + "learning_rate": 6.78661997343316e-06, + "loss": 0.4187, + "step": 7157, + "task_loss": 0.327172189950943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29771751165390015, + "epoch": 6.05, + "learning_rate": 6.78058205530733e-06, + "loss": 0.3586, + "step": 7158, + "task_loss": 0.35997769236564636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4271741509437561, + "epoch": 6.05, + "learning_rate": 6.7745441371815e-06, + "loss": 0.4096, + "step": 7159, + "task_loss": 0.46357718110084534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26467353105545044, + "epoch": 6.05, + "learning_rate": 6.768506219055671e-06, + "loss": 0.4243, + "step": 7160, + "task_loss": 0.17903000116348267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35673514008522034, + "epoch": 6.05, + "learning_rate": 6.76246830092984e-06, + "loss": 0.4081, + "step": 7161, + "task_loss": 0.9858098030090332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41611674427986145, + "epoch": 6.05, + "learning_rate": 6.756430382804009e-06, + "loss": 0.4238, + "step": 7162, + "task_loss": 0.32074862718582153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.371525377035141, + "epoch": 6.05, + "learning_rate": 6.750392464678179e-06, + "loss": 0.3966, + "step": 7163, + "task_loss": 0.5520741939544678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2822134494781494, + "epoch": 6.06, + "learning_rate": 6.744354546552349e-06, + "loss": 0.4649, + "step": 7164, + "task_loss": 0.5250979065895081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26244956254959106, + "epoch": 6.06, + "learning_rate": 6.738316628426519e-06, + "loss": 0.4808, + "step": 7165, + "task_loss": 0.7779847383499146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2613524794578552, + "epoch": 6.06, + "learning_rate": 6.732278710300689e-06, + "loss": 0.3055, + "step": 7166, + "task_loss": 0.06769565492868423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3707756996154785, + "epoch": 6.06, + "learning_rate": 6.726240792174858e-06, + "loss": 0.5357, + "step": 7167, + "task_loss": 0.07357662171125412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.353291392326355, + "epoch": 6.06, + "learning_rate": 6.720202874049028e-06, + "loss": 0.4597, + "step": 7168, + "task_loss": 0.1118333637714386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2877667546272278, + "epoch": 6.06, + "learning_rate": 6.714164955923198e-06, + "loss": 0.4445, + "step": 7169, + "task_loss": 0.35710740089416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35396718978881836, + "epoch": 6.06, + "learning_rate": 6.708127037797368e-06, + "loss": 0.3683, + "step": 7170, + "task_loss": 0.2976921498775482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4750630855560303, + "epoch": 6.06, + "learning_rate": 6.7020891196715385e-06, + "loss": 0.4242, + "step": 7171, + "task_loss": 0.726797878742218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37916404008865356, + "epoch": 6.06, + "learning_rate": 6.6960512015457065e-06, + "loss": 0.4347, + "step": 7172, + "task_loss": 0.2923533022403717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3319649398326874, + "epoch": 6.06, + "learning_rate": 6.690013283419877e-06, + "loss": 0.5594, + "step": 7173, + "task_loss": 1.6473699808120728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36475253105163574, + "epoch": 6.06, + "learning_rate": 6.683975365294047e-06, + "loss": 0.6092, + "step": 7174, + "task_loss": 0.8801401853561401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6032578349113464, + "epoch": 6.07, + "learning_rate": 6.677937447168217e-06, + "loss": 0.5348, + "step": 7175, + "task_loss": 0.7852023839950562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6037065982818604, + "epoch": 6.07, + "learning_rate": 6.671899529042387e-06, + "loss": 0.4547, + "step": 7176, + "task_loss": 0.3197253942489624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27962881326675415, + "epoch": 6.07, + "learning_rate": 6.665861610916556e-06, + "loss": 0.4451, + "step": 7177, + "task_loss": 0.07280945032835007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4126201868057251, + "epoch": 6.07, + "learning_rate": 6.659823692790726e-06, + "loss": 0.4222, + "step": 7178, + "task_loss": 0.5707032680511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.253854900598526, + "epoch": 6.07, + "learning_rate": 6.653785774664896e-06, + "loss": 0.3142, + "step": 7179, + "task_loss": 0.8102204203605652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4161665439605713, + "epoch": 6.07, + "learning_rate": 6.647747856539066e-06, + "loss": 0.4745, + "step": 7180, + "task_loss": 1.3409544229507446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5477035045623779, + "epoch": 6.07, + "learning_rate": 6.641709938413235e-06, + "loss": 0.4311, + "step": 7181, + "task_loss": 0.7404505014419556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5668452382087708, + "epoch": 6.07, + "learning_rate": 6.635672020287405e-06, + "loss": 0.4508, + "step": 7182, + "task_loss": 0.8325549364089966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22278167307376862, + "epoch": 6.07, + "learning_rate": 6.629634102161575e-06, + "loss": 0.3544, + "step": 7183, + "task_loss": 0.08417687565088272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41919636726379395, + "epoch": 6.07, + "learning_rate": 6.623596184035745e-06, + "loss": 0.5219, + "step": 7184, + "task_loss": 0.24531899392604828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3704850375652313, + "epoch": 6.07, + "learning_rate": 6.617558265909915e-06, + "loss": 0.4751, + "step": 7185, + "task_loss": 0.6573542356491089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4520191252231598, + "epoch": 6.07, + "learning_rate": 6.6115203477840835e-06, + "loss": 0.3109, + "step": 7186, + "task_loss": 0.5783968567848206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21963025629520416, + "epoch": 6.08, + "learning_rate": 6.605482429658254e-06, + "loss": 0.4426, + "step": 7187, + "task_loss": 0.29841774702072144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6478004455566406, + "epoch": 6.08, + "learning_rate": 6.599444511532424e-06, + "loss": 0.5063, + "step": 7188, + "task_loss": 1.5139487981796265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45501431822776794, + "epoch": 6.08, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.4693, + "step": 7189, + "task_loss": 1.0436865091323853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6331683397293091, + "epoch": 6.08, + "learning_rate": 6.587368675280764e-06, + "loss": 0.4353, + "step": 7190, + "task_loss": 1.1692107915878296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35024380683898926, + "epoch": 6.08, + "learning_rate": 6.581330757154933e-06, + "loss": 0.3994, + "step": 7191, + "task_loss": 0.4679393768310547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5039750337600708, + "epoch": 6.08, + "learning_rate": 6.575292839029103e-06, + "loss": 0.4574, + "step": 7192, + "task_loss": 0.6149628758430481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5373827815055847, + "epoch": 6.08, + "learning_rate": 6.5692549209032724e-06, + "loss": 0.4481, + "step": 7193, + "task_loss": 1.1550272703170776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2818429470062256, + "epoch": 6.08, + "learning_rate": 6.563217002777443e-06, + "loss": 0.4204, + "step": 7194, + "task_loss": 0.3789758086204529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33568838238716125, + "epoch": 6.08, + "learning_rate": 6.557179084651613e-06, + "loss": 0.3075, + "step": 7195, + "task_loss": 0.23927980661392212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4208991825580597, + "epoch": 6.08, + "learning_rate": 6.551141166525782e-06, + "loss": 0.4062, + "step": 7196, + "task_loss": 1.020880937576294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3647145926952362, + "epoch": 6.08, + "learning_rate": 6.545103248399951e-06, + "loss": 0.3981, + "step": 7197, + "task_loss": 0.5236802697181702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2462729662656784, + "epoch": 6.08, + "learning_rate": 6.539065330274122e-06, + "loss": 0.3264, + "step": 7198, + "task_loss": 0.44931530952453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4583521783351898, + "epoch": 6.09, + "learning_rate": 6.533027412148292e-06, + "loss": 0.4561, + "step": 7199, + "task_loss": 0.8439003825187683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15079770982265472, + "epoch": 6.09, + "learning_rate": 6.526989494022462e-06, + "loss": 0.3586, + "step": 7200, + "task_loss": 0.6448714733123779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30904316902160645, + "epoch": 6.09, + "learning_rate": 6.52095157589663e-06, + "loss": 0.5907, + "step": 7201, + "task_loss": 0.2571970224380493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3346657156944275, + "epoch": 6.09, + "learning_rate": 6.514913657770801e-06, + "loss": 0.3241, + "step": 7202, + "task_loss": 0.1623639166355133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36350733041763306, + "epoch": 6.09, + "learning_rate": 6.5088757396449705e-06, + "loss": 0.3813, + "step": 7203, + "task_loss": 0.7544834017753601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2563495337963104, + "epoch": 6.09, + "learning_rate": 6.502837821519141e-06, + "loss": 0.396, + "step": 7204, + "task_loss": 0.7555733323097229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39668601751327515, + "epoch": 6.09, + "learning_rate": 6.496799903393311e-06, + "loss": 0.5621, + "step": 7205, + "task_loss": 0.28675737977027893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4389287829399109, + "epoch": 6.09, + "learning_rate": 6.49076198526748e-06, + "loss": 0.4949, + "step": 7206, + "task_loss": 1.0579266548156738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.367116779088974, + "epoch": 6.09, + "learning_rate": 6.484724067141649e-06, + "loss": 0.3827, + "step": 7207, + "task_loss": 0.10323720425367355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19079753756523132, + "epoch": 6.09, + "learning_rate": 6.47868614901582e-06, + "loss": 0.3817, + "step": 7208, + "task_loss": 0.8882685303688049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4799621105194092, + "epoch": 6.09, + "learning_rate": 6.47264823088999e-06, + "loss": 0.3971, + "step": 7209, + "task_loss": 0.3712241053581238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2461603730916977, + "epoch": 6.09, + "learning_rate": 6.4666103127641594e-06, + "loss": 0.3244, + "step": 7210, + "task_loss": 0.5583053827285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5292807817459106, + "epoch": 6.1, + "learning_rate": 6.460572394638328e-06, + "loss": 0.4817, + "step": 7211, + "task_loss": 1.0666821002960205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.341541588306427, + "epoch": 6.1, + "learning_rate": 6.454534476512499e-06, + "loss": 0.4323, + "step": 7212, + "task_loss": 0.02787797898054123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38108378648757935, + "epoch": 6.1, + "learning_rate": 6.448496558386669e-06, + "loss": 0.3863, + "step": 7213, + "task_loss": 0.33153119683265686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37953752279281616, + "epoch": 6.1, + "learning_rate": 6.442458640260838e-06, + "loss": 0.4349, + "step": 7214, + "task_loss": 0.2721627950668335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30654704570770264, + "epoch": 6.1, + "learning_rate": 6.436420722135009e-06, + "loss": 0.3059, + "step": 7215, + "task_loss": 0.2704087793827057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5520497560501099, + "epoch": 6.1, + "learning_rate": 6.430382804009178e-06, + "loss": 0.4473, + "step": 7216, + "task_loss": 0.7592300176620483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5831268429756165, + "epoch": 6.1, + "learning_rate": 6.4243448858833475e-06, + "loss": 0.4839, + "step": 7217, + "task_loss": 0.8331745862960815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7392500042915344, + "epoch": 6.1, + "learning_rate": 6.418306967757517e-06, + "loss": 0.4454, + "step": 7218, + "task_loss": 0.5427871942520142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30849045515060425, + "epoch": 6.1, + "learning_rate": 6.412269049631688e-06, + "loss": 0.4129, + "step": 7219, + "task_loss": 1.013205647468567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43472176790237427, + "epoch": 6.1, + "learning_rate": 6.4062311315058575e-06, + "loss": 0.429, + "step": 7220, + "task_loss": 0.9005637168884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4241586923599243, + "epoch": 6.1, + "learning_rate": 6.400193213380026e-06, + "loss": 0.3496, + "step": 7221, + "task_loss": 0.39624297618865967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19200864434242249, + "epoch": 6.1, + "learning_rate": 6.394155295254196e-06, + "loss": 0.3586, + "step": 7222, + "task_loss": 0.028283827006816864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3397844731807709, + "epoch": 6.11, + "learning_rate": 6.388117377128367e-06, + "loss": 0.5111, + "step": 7223, + "task_loss": 0.6029148101806641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34376752376556396, + "epoch": 6.11, + "learning_rate": 6.382079459002536e-06, + "loss": 0.4528, + "step": 7224, + "task_loss": 0.931618332862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33462175726890564, + "epoch": 6.11, + "learning_rate": 6.376041540876707e-06, + "loss": 0.3314, + "step": 7225, + "task_loss": 0.958778440952301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32678845524787903, + "epoch": 6.11, + "learning_rate": 6.370003622750875e-06, + "loss": 0.4355, + "step": 7226, + "task_loss": 0.13563141226768494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.584662675857544, + "epoch": 6.11, + "learning_rate": 6.363965704625046e-06, + "loss": 0.4765, + "step": 7227, + "task_loss": 0.44774317741394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4314029812812805, + "epoch": 6.11, + "learning_rate": 6.357927786499215e-06, + "loss": 0.4261, + "step": 7228, + "task_loss": 0.5963894724845886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7416990399360657, + "epoch": 6.11, + "learning_rate": 6.351889868373385e-06, + "loss": 0.472, + "step": 7229, + "task_loss": 0.5903911590576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4081224203109741, + "epoch": 6.11, + "learning_rate": 6.345851950247556e-06, + "loss": 0.5169, + "step": 7230, + "task_loss": 0.15100185573101044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5304721593856812, + "epoch": 6.11, + "learning_rate": 6.3398140321217245e-06, + "loss": 0.4036, + "step": 7231, + "task_loss": 0.48152437806129456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.339046835899353, + "epoch": 6.11, + "learning_rate": 6.333776113995894e-06, + "loss": 0.3833, + "step": 7232, + "task_loss": 0.14621078968048096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5772907137870789, + "epoch": 6.11, + "learning_rate": 6.327738195870064e-06, + "loss": 0.4169, + "step": 7233, + "task_loss": 0.6762048602104187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3395792543888092, + "epoch": 6.11, + "learning_rate": 6.3217002777442345e-06, + "loss": 0.4118, + "step": 7234, + "task_loss": 0.41521400213241577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4958908259868622, + "epoch": 6.12, + "learning_rate": 6.315662359618404e-06, + "loss": 0.5398, + "step": 7235, + "task_loss": 0.21414095163345337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4538428783416748, + "epoch": 6.12, + "learning_rate": 6.309624441492573e-06, + "loss": 0.342, + "step": 7236, + "task_loss": 0.7001739740371704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41967061161994934, + "epoch": 6.12, + "learning_rate": 6.303586523366743e-06, + "loss": 0.4931, + "step": 7237, + "task_loss": 0.8529934883117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23234857618808746, + "epoch": 6.12, + "learning_rate": 6.297548605240913e-06, + "loss": 0.2655, + "step": 7238, + "task_loss": 0.148982971906662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47618067264556885, + "epoch": 6.12, + "learning_rate": 6.291510687115083e-06, + "loss": 0.4274, + "step": 7239, + "task_loss": 1.2428069114685059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5605955123901367, + "epoch": 6.12, + "learning_rate": 6.285472768989254e-06, + "loss": 0.4915, + "step": 7240, + "task_loss": 0.3070848882198334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32097023725509644, + "epoch": 6.12, + "learning_rate": 6.279434850863422e-06, + "loss": 0.3544, + "step": 7241, + "task_loss": 0.7405945062637329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26526740193367004, + "epoch": 6.12, + "learning_rate": 6.273396932737592e-06, + "loss": 0.4234, + "step": 7242, + "task_loss": 0.49552497267723083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28193455934524536, + "epoch": 6.12, + "learning_rate": 6.267359014611762e-06, + "loss": 0.3365, + "step": 7243, + "task_loss": 0.5540476441383362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5849704742431641, + "epoch": 6.12, + "learning_rate": 6.261321096485933e-06, + "loss": 0.4917, + "step": 7244, + "task_loss": 0.6016995310783386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6353238821029663, + "epoch": 6.12, + "learning_rate": 6.255283178360102e-06, + "loss": 0.5695, + "step": 7245, + "task_loss": 0.5593791604042053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48674285411834717, + "epoch": 6.13, + "learning_rate": 6.249245260234272e-06, + "loss": 0.437, + "step": 7246, + "task_loss": 0.8704448938369751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3887278437614441, + "epoch": 6.13, + "learning_rate": 6.243207342108441e-06, + "loss": 0.3945, + "step": 7247, + "task_loss": 1.036819577217102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7221405506134033, + "epoch": 6.13, + "learning_rate": 6.2371694239826115e-06, + "loss": 0.4011, + "step": 7248, + "task_loss": 0.8440427780151367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2905399799346924, + "epoch": 6.13, + "learning_rate": 6.23113150585678e-06, + "loss": 0.293, + "step": 7249, + "task_loss": 0.39458930492401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44837361574172974, + "epoch": 6.13, + "learning_rate": 6.225093587730951e-06, + "loss": 0.3776, + "step": 7250, + "task_loss": 0.8367217779159546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3761056661605835, + "epoch": 6.13, + "learning_rate": 6.219055669605121e-06, + "loss": 0.3713, + "step": 7251, + "task_loss": 0.9336838126182556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5167402625083923, + "epoch": 6.13, + "learning_rate": 6.21301775147929e-06, + "loss": 0.5295, + "step": 7252, + "task_loss": 0.24397388100624084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37550050020217896, + "epoch": 6.13, + "learning_rate": 6.20697983335346e-06, + "loss": 0.5021, + "step": 7253, + "task_loss": 0.22421199083328247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30869266390800476, + "epoch": 6.13, + "learning_rate": 6.20094191522763e-06, + "loss": 0.5231, + "step": 7254, + "task_loss": 0.8793156147003174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21807971596717834, + "epoch": 6.13, + "learning_rate": 6.1949039971017996e-06, + "loss": 0.3784, + "step": 7255, + "task_loss": 0.5410561561584473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27783331274986267, + "epoch": 6.13, + "learning_rate": 6.188866078975969e-06, + "loss": 0.3802, + "step": 7256, + "task_loss": 0.4019317328929901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.655258059501648, + "epoch": 6.13, + "learning_rate": 6.182828160850139e-06, + "loss": 0.356, + "step": 7257, + "task_loss": 0.8426075577735901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30961620807647705, + "epoch": 6.14, + "learning_rate": 6.176790242724309e-06, + "loss": 0.5551, + "step": 7258, + "task_loss": 0.8421933650970459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24080996215343475, + "epoch": 6.14, + "learning_rate": 6.1707523245984785e-06, + "loss": 0.3145, + "step": 7259, + "task_loss": 0.3258103132247925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6343517303466797, + "epoch": 6.14, + "learning_rate": 6.164714406472648e-06, + "loss": 0.6194, + "step": 7260, + "task_loss": 0.9431158900260925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43458953499794006, + "epoch": 6.14, + "learning_rate": 6.158676488346819e-06, + "loss": 0.3747, + "step": 7261, + "task_loss": 0.0729730948805809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22360976040363312, + "epoch": 6.14, + "learning_rate": 6.152638570220988e-06, + "loss": 0.3519, + "step": 7262, + "task_loss": 0.018381869420409203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30725303292274475, + "epoch": 6.14, + "learning_rate": 6.146600652095158e-06, + "loss": 0.4896, + "step": 7263, + "task_loss": 0.7540176510810852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4743155539035797, + "epoch": 6.14, + "learning_rate": 6.140562733969327e-06, + "loss": 0.5252, + "step": 7264, + "task_loss": 2.6361896991729736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40360334515571594, + "epoch": 6.14, + "learning_rate": 6.134524815843498e-06, + "loss": 0.5201, + "step": 7265, + "task_loss": 1.1085761785507202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7494778633117676, + "epoch": 6.14, + "learning_rate": 6.128486897717667e-06, + "loss": 0.7189, + "step": 7266, + "task_loss": 0.6918783783912659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34853583574295044, + "epoch": 6.14, + "learning_rate": 6.122448979591837e-06, + "loss": 0.3056, + "step": 7267, + "task_loss": 1.0667970180511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33720457553863525, + "epoch": 6.14, + "learning_rate": 6.116411061466007e-06, + "loss": 0.3559, + "step": 7268, + "task_loss": 0.8019275069236755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38479551672935486, + "epoch": 6.14, + "learning_rate": 6.1103731433401765e-06, + "loss": 0.4182, + "step": 7269, + "task_loss": 0.2925267219543457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46497076749801636, + "epoch": 6.15, + "learning_rate": 6.104335225214346e-06, + "loss": 0.5108, + "step": 7270, + "task_loss": 0.0890534445643425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7163013219833374, + "epoch": 6.15, + "learning_rate": 6.098297307088517e-06, + "loss": 0.5152, + "step": 7271, + "task_loss": 0.5404446721076965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24514922499656677, + "epoch": 6.15, + "learning_rate": 6.092259388962686e-06, + "loss": 0.304, + "step": 7272, + "task_loss": 0.1903451383113861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6874876022338867, + "epoch": 6.15, + "learning_rate": 6.086221470836856e-06, + "loss": 0.4389, + "step": 7273, + "task_loss": 0.46717455983161926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36466658115386963, + "epoch": 6.15, + "learning_rate": 6.080183552711025e-06, + "loss": 0.4183, + "step": 7274, + "task_loss": 0.7481908202171326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38540419936180115, + "epoch": 6.15, + "learning_rate": 6.074145634585196e-06, + "loss": 0.507, + "step": 7275, + "task_loss": 0.8377128839492798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.607723593711853, + "epoch": 6.15, + "learning_rate": 6.0681077164593655e-06, + "loss": 0.5996, + "step": 7276, + "task_loss": 1.2768973112106323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4994179606437683, + "epoch": 6.15, + "learning_rate": 6.062069798333535e-06, + "loss": 0.447, + "step": 7277, + "task_loss": 0.4322272539138794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40883272886276245, + "epoch": 6.15, + "learning_rate": 6.056031880207705e-06, + "loss": 0.3761, + "step": 7278, + "task_loss": 0.07739400863647461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39878591895103455, + "epoch": 6.15, + "learning_rate": 6.049993962081875e-06, + "loss": 0.4188, + "step": 7279, + "task_loss": 0.5073037147521973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3509463667869568, + "epoch": 6.15, + "learning_rate": 6.043956043956044e-06, + "loss": 0.3611, + "step": 7280, + "task_loss": 0.7188200354576111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.436120867729187, + "epoch": 6.15, + "learning_rate": 6.037918125830214e-06, + "loss": 0.441, + "step": 7281, + "task_loss": 0.7510096430778503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49622464179992676, + "epoch": 6.16, + "learning_rate": 6.031880207704384e-06, + "loss": 0.4155, + "step": 7282, + "task_loss": 0.43904760479927063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39170512557029724, + "epoch": 6.16, + "learning_rate": 6.0258422895785535e-06, + "loss": 0.5618, + "step": 7283, + "task_loss": 0.6548123955726624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3486229181289673, + "epoch": 6.16, + "learning_rate": 6.019804371452723e-06, + "loss": 0.3256, + "step": 7284, + "task_loss": 0.7142788767814636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2012559324502945, + "epoch": 6.16, + "learning_rate": 6.013766453326893e-06, + "loss": 0.389, + "step": 7285, + "task_loss": 0.2915891706943512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.316392183303833, + "epoch": 6.16, + "learning_rate": 6.0077285352010635e-06, + "loss": 0.4925, + "step": 7286, + "task_loss": 0.4840840697288513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3201707899570465, + "epoch": 6.16, + "learning_rate": 6.0016906170752324e-06, + "loss": 0.4569, + "step": 7287, + "task_loss": 0.10829141736030579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4134579002857208, + "epoch": 6.16, + "learning_rate": 5.995652698949403e-06, + "loss": 0.4123, + "step": 7288, + "task_loss": 0.3835233449935913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4540594518184662, + "epoch": 6.16, + "learning_rate": 5.989614780823572e-06, + "loss": 0.5934, + "step": 7289, + "task_loss": 1.351678729057312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.385326623916626, + "epoch": 6.16, + "learning_rate": 5.9835768626977424e-06, + "loss": 0.3144, + "step": 7290, + "task_loss": 0.5265473127365112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45922553539276123, + "epoch": 6.16, + "learning_rate": 5.977538944571911e-06, + "loss": 0.4517, + "step": 7291, + "task_loss": 0.29739147424697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3387126922607422, + "epoch": 6.16, + "learning_rate": 5.971501026446082e-06, + "loss": 0.4115, + "step": 7292, + "task_loss": 0.6748889088630676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6860440373420715, + "epoch": 6.16, + "learning_rate": 5.965463108320252e-06, + "loss": 0.4421, + "step": 7293, + "task_loss": 0.7484753727912903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23819462954998016, + "epoch": 6.17, + "learning_rate": 5.959425190194421e-06, + "loss": 0.3806, + "step": 7294, + "task_loss": 0.542414665222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2464994490146637, + "epoch": 6.17, + "learning_rate": 5.953387272068591e-06, + "loss": 0.371, + "step": 7295, + "task_loss": 0.10484969615936279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5479385852813721, + "epoch": 6.17, + "learning_rate": 5.947349353942761e-06, + "loss": 0.6044, + "step": 7296, + "task_loss": 0.6484646201133728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7702033519744873, + "epoch": 6.17, + "learning_rate": 5.9413114358169305e-06, + "loss": 0.5421, + "step": 7297, + "task_loss": 0.3023211658000946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5532692670822144, + "epoch": 6.17, + "learning_rate": 5.9352735176911e-06, + "loss": 0.3602, + "step": 7298, + "task_loss": 0.6200825572013855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32802143692970276, + "epoch": 6.17, + "learning_rate": 5.92923559956527e-06, + "loss": 0.3859, + "step": 7299, + "task_loss": 0.6853464245796204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19541683793067932, + "epoch": 6.17, + "learning_rate": 5.92319768143944e-06, + "loss": 0.3515, + "step": 7300, + "task_loss": 0.5572487115859985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49718689918518066, + "epoch": 6.17, + "learning_rate": 5.917159763313609e-06, + "loss": 0.3829, + "step": 7301, + "task_loss": 0.9976769089698792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20859295129776, + "epoch": 6.17, + "learning_rate": 5.911121845187779e-06, + "loss": 0.3031, + "step": 7302, + "task_loss": 0.20879845321178436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3193260729312897, + "epoch": 6.17, + "learning_rate": 5.90508392706195e-06, + "loss": 0.5351, + "step": 7303, + "task_loss": 0.32081133127212524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34988072514533997, + "epoch": 6.17, + "learning_rate": 5.899046008936119e-06, + "loss": 0.3856, + "step": 7304, + "task_loss": 0.47435665130615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5564247965812683, + "epoch": 6.17, + "learning_rate": 5.893008090810289e-06, + "loss": 0.5241, + "step": 7305, + "task_loss": 1.6786940097808838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17770510911941528, + "epoch": 6.18, + "learning_rate": 5.886970172684458e-06, + "loss": 0.3584, + "step": 7306, + "task_loss": 0.0852338969707489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5578165054321289, + "epoch": 6.18, + "learning_rate": 5.880932254558629e-06, + "loss": 0.4564, + "step": 7307, + "task_loss": 0.4016437232494354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6362099647521973, + "epoch": 6.18, + "learning_rate": 5.874894336432798e-06, + "loss": 0.4208, + "step": 7308, + "task_loss": 1.5623605251312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5555299520492554, + "epoch": 6.18, + "learning_rate": 5.868856418306968e-06, + "loss": 0.482, + "step": 7309, + "task_loss": 1.108620285987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24304023385047913, + "epoch": 6.18, + "learning_rate": 5.862818500181138e-06, + "loss": 0.3736, + "step": 7310, + "task_loss": 0.15987583994865417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6921083927154541, + "epoch": 6.18, + "learning_rate": 5.8567805820553075e-06, + "loss": 0.6145, + "step": 7311, + "task_loss": 0.6368083953857422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3926209807395935, + "epoch": 6.18, + "learning_rate": 5.850742663929477e-06, + "loss": 0.463, + "step": 7312, + "task_loss": 0.3289099633693695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3738211691379547, + "epoch": 6.18, + "learning_rate": 5.844704745803648e-06, + "loss": 0.4137, + "step": 7313, + "task_loss": 0.34171897172927856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3224182426929474, + "epoch": 6.18, + "learning_rate": 5.838666827677817e-06, + "loss": 0.4432, + "step": 7314, + "task_loss": 0.2815491259098053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3088608384132385, + "epoch": 6.18, + "learning_rate": 5.832628909551987e-06, + "loss": 0.3704, + "step": 7315, + "task_loss": 0.6262466311454773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34182438254356384, + "epoch": 6.18, + "learning_rate": 5.826590991426156e-06, + "loss": 0.5061, + "step": 7316, + "task_loss": 0.377057284116745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5689165592193604, + "epoch": 6.19, + "learning_rate": 5.820553073300327e-06, + "loss": 0.4447, + "step": 7317, + "task_loss": 0.24629873037338257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2638874650001526, + "epoch": 6.19, + "learning_rate": 5.814515155174496e-06, + "loss": 0.3839, + "step": 7318, + "task_loss": 0.6169545650482178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.368215948343277, + "epoch": 6.19, + "learning_rate": 5.808477237048666e-06, + "loss": 0.3961, + "step": 7319, + "task_loss": 0.7739343047142029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43293672800064087, + "epoch": 6.19, + "learning_rate": 5.802439318922836e-06, + "loss": 0.4008, + "step": 7320, + "task_loss": 1.3913404941558838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45702236890792847, + "epoch": 6.19, + "learning_rate": 5.796401400797006e-06, + "loss": 0.5692, + "step": 7321, + "task_loss": 1.6856454610824585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24414291977882385, + "epoch": 6.19, + "learning_rate": 5.790363482671175e-06, + "loss": 0.3822, + "step": 7322, + "task_loss": 0.7323386073112488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3668941855430603, + "epoch": 6.19, + "learning_rate": 5.784325564545345e-06, + "loss": 0.4266, + "step": 7323, + "task_loss": 0.5207235813140869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38056740164756775, + "epoch": 6.19, + "learning_rate": 5.778287646419515e-06, + "loss": 0.447, + "step": 7324, + "task_loss": 0.2720244824886322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4575008153915405, + "epoch": 6.19, + "learning_rate": 5.7722497282936845e-06, + "loss": 0.4409, + "step": 7325, + "task_loss": 0.8203812837600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22949141263961792, + "epoch": 6.19, + "learning_rate": 5.766211810167854e-06, + "loss": 0.4594, + "step": 7326, + "task_loss": 0.1782011240720749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27222833037376404, + "epoch": 6.19, + "learning_rate": 5.760173892042024e-06, + "loss": 0.4155, + "step": 7327, + "task_loss": 0.15903620421886444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24252432584762573, + "epoch": 6.19, + "learning_rate": 5.7541359739161945e-06, + "loss": 0.394, + "step": 7328, + "task_loss": 0.5022997856140137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21282315254211426, + "epoch": 6.2, + "learning_rate": 5.748098055790363e-06, + "loss": 0.4268, + "step": 7329, + "task_loss": 0.7104138731956482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4601905643939972, + "epoch": 6.2, + "learning_rate": 5.742060137664534e-06, + "loss": 0.3699, + "step": 7330, + "task_loss": 0.3149878978729248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28273850679397583, + "epoch": 6.2, + "learning_rate": 5.736022219538703e-06, + "loss": 0.3864, + "step": 7331, + "task_loss": 0.39543840289115906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7686933279037476, + "epoch": 6.2, + "learning_rate": 5.729984301412873e-06, + "loss": 0.5822, + "step": 7332, + "task_loss": 0.7132221460342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33279579877853394, + "epoch": 6.2, + "learning_rate": 5.723946383287043e-06, + "loss": 0.3279, + "step": 7333, + "task_loss": 0.8187244534492493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36706727743148804, + "epoch": 6.2, + "learning_rate": 5.717908465161213e-06, + "loss": 0.3978, + "step": 7334, + "task_loss": 0.58487868309021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3357492685317993, + "epoch": 6.2, + "learning_rate": 5.7118705470353826e-06, + "loss": 0.3527, + "step": 7335, + "task_loss": 0.7118744254112244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2914299964904785, + "epoch": 6.2, + "learning_rate": 5.705832628909552e-06, + "loss": 0.3204, + "step": 7336, + "task_loss": 0.05488895624876022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3607632517814636, + "epoch": 6.2, + "learning_rate": 5.699794710783722e-06, + "loss": 0.3934, + "step": 7337, + "task_loss": 0.3483980894088745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5294084548950195, + "epoch": 6.2, + "learning_rate": 5.693756792657892e-06, + "loss": 0.4286, + "step": 7338, + "task_loss": 1.129359483718872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18070408701896667, + "epoch": 6.2, + "learning_rate": 5.6877188745320615e-06, + "loss": 0.3092, + "step": 7339, + "task_loss": 0.5854549407958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22348564863204956, + "epoch": 6.2, + "learning_rate": 5.681680956406232e-06, + "loss": 0.3731, + "step": 7340, + "task_loss": 0.2520732879638672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3547956347465515, + "epoch": 6.21, + "learning_rate": 5.675643038280401e-06, + "loss": 0.4684, + "step": 7341, + "task_loss": 0.5506937503814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4159926772117615, + "epoch": 6.21, + "learning_rate": 5.6696051201545715e-06, + "loss": 0.3774, + "step": 7342, + "task_loss": 0.7520624399185181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2730810344219208, + "epoch": 6.21, + "learning_rate": 5.66356720202874e-06, + "loss": 0.3755, + "step": 7343, + "task_loss": 0.18272829055786133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5668665170669556, + "epoch": 6.21, + "learning_rate": 5.65752928390291e-06, + "loss": 0.5108, + "step": 7344, + "task_loss": 0.35754671692848206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.619672417640686, + "epoch": 6.21, + "learning_rate": 5.651491365777081e-06, + "loss": 0.5321, + "step": 7345, + "task_loss": 0.7355321645736694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3953667879104614, + "epoch": 6.21, + "learning_rate": 5.6454534476512495e-06, + "loss": 0.4635, + "step": 7346, + "task_loss": 0.4438496530056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3912051022052765, + "epoch": 6.21, + "learning_rate": 5.63941552952542e-06, + "loss": 0.3678, + "step": 7347, + "task_loss": 0.6003245115280151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5242830514907837, + "epoch": 6.21, + "learning_rate": 5.633377611399589e-06, + "loss": 0.5184, + "step": 7348, + "task_loss": 0.28880128264427185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6186342239379883, + "epoch": 6.21, + "learning_rate": 5.6273396932737596e-06, + "loss": 0.4431, + "step": 7349, + "task_loss": 1.725752830505371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2898389399051666, + "epoch": 6.21, + "learning_rate": 5.621301775147929e-06, + "loss": 0.3581, + "step": 7350, + "task_loss": 0.2267938256263733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35627055168151855, + "epoch": 6.21, + "learning_rate": 5.615263857022099e-06, + "loss": 0.4487, + "step": 7351, + "task_loss": 1.0505273342132568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8235265016555786, + "epoch": 6.21, + "learning_rate": 5.609225938896269e-06, + "loss": 0.5961, + "step": 7352, + "task_loss": 0.9141087532043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6401113867759705, + "epoch": 6.22, + "learning_rate": 5.6031880207704385e-06, + "loss": 0.5655, + "step": 7353, + "task_loss": 1.0912744998931885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7079514265060425, + "epoch": 6.22, + "learning_rate": 5.597150102644608e-06, + "loss": 0.5564, + "step": 7354, + "task_loss": 0.8809816837310791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6462124586105347, + "epoch": 6.22, + "learning_rate": 5.591112184518779e-06, + "loss": 0.4641, + "step": 7355, + "task_loss": 0.7900420427322388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3693937361240387, + "epoch": 6.22, + "learning_rate": 5.585074266392948e-06, + "loss": 0.431, + "step": 7356, + "task_loss": 0.687492311000824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3564663529396057, + "epoch": 6.22, + "learning_rate": 5.579036348267118e-06, + "loss": 0.3549, + "step": 7357, + "task_loss": 0.46581822633743286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3641672730445862, + "epoch": 6.22, + "learning_rate": 5.572998430141287e-06, + "loss": 0.3712, + "step": 7358, + "task_loss": 0.41328558325767517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5339758396148682, + "epoch": 6.22, + "learning_rate": 5.566960512015458e-06, + "loss": 0.427, + "step": 7359, + "task_loss": 0.92621248960495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4629404544830322, + "epoch": 6.22, + "learning_rate": 5.560922593889627e-06, + "loss": 0.3534, + "step": 7360, + "task_loss": 0.6260806322097778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5174963474273682, + "epoch": 6.22, + "learning_rate": 5.554884675763797e-06, + "loss": 0.4567, + "step": 7361, + "task_loss": 0.716735303401947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23456385731697083, + "epoch": 6.22, + "learning_rate": 5.548846757637967e-06, + "loss": 0.331, + "step": 7362, + "task_loss": 0.4039887487888336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23929733037948608, + "epoch": 6.22, + "learning_rate": 5.5428088395121365e-06, + "loss": 0.3706, + "step": 7363, + "task_loss": 0.5053170323371887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6820805668830872, + "epoch": 6.22, + "learning_rate": 5.536770921386306e-06, + "loss": 0.4868, + "step": 7364, + "task_loss": 0.9453413486480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6904684901237488, + "epoch": 6.23, + "learning_rate": 5.530733003260476e-06, + "loss": 0.5795, + "step": 7365, + "task_loss": 0.8088664412498474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1944456696510315, + "epoch": 6.23, + "learning_rate": 5.524695085134646e-06, + "loss": 0.4499, + "step": 7366, + "task_loss": 0.1197943240404129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3873312473297119, + "epoch": 6.23, + "learning_rate": 5.5186571670088154e-06, + "loss": 0.365, + "step": 7367, + "task_loss": 0.5512471199035645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2314416915178299, + "epoch": 6.23, + "learning_rate": 5.512619248882985e-06, + "loss": 0.3929, + "step": 7368, + "task_loss": 0.29062992334365845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33906030654907227, + "epoch": 6.23, + "learning_rate": 5.506581330757155e-06, + "loss": 0.4155, + "step": 7369, + "task_loss": 0.5000306963920593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4262440800666809, + "epoch": 6.23, + "learning_rate": 5.5005434126313255e-06, + "loss": 0.339, + "step": 7370, + "task_loss": 0.3971487283706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46672385931015015, + "epoch": 6.23, + "learning_rate": 5.494505494505494e-06, + "loss": 0.523, + "step": 7371, + "task_loss": 0.6195513606071472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41375917196273804, + "epoch": 6.23, + "learning_rate": 5.488467576379665e-06, + "loss": 0.4771, + "step": 7372, + "task_loss": 0.30415114760398865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8488360643386841, + "epoch": 6.23, + "learning_rate": 5.482429658253834e-06, + "loss": 0.5253, + "step": 7373, + "task_loss": 1.3912296295166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4554328918457031, + "epoch": 6.23, + "learning_rate": 5.476391740128004e-06, + "loss": 0.4001, + "step": 7374, + "task_loss": 0.4204336404800415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6580468416213989, + "epoch": 6.23, + "learning_rate": 5.470353822002174e-06, + "loss": 0.4416, + "step": 7375, + "task_loss": 1.1220093965530396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3626665472984314, + "epoch": 6.23, + "learning_rate": 5.464315903876344e-06, + "loss": 0.4647, + "step": 7376, + "task_loss": 1.1311931610107422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4704763889312744, + "epoch": 6.24, + "learning_rate": 5.4582779857505135e-06, + "loss": 0.4677, + "step": 7377, + "task_loss": 0.6023044586181641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23187902569770813, + "epoch": 6.24, + "learning_rate": 5.452240067624683e-06, + "loss": 0.6061, + "step": 7378, + "task_loss": 0.9020999670028687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3436601758003235, + "epoch": 6.24, + "learning_rate": 5.446202149498853e-06, + "loss": 0.3869, + "step": 7379, + "task_loss": 0.4765997529029846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32063955068588257, + "epoch": 6.24, + "learning_rate": 5.440164231373023e-06, + "loss": 0.4078, + "step": 7380, + "task_loss": 0.6190409064292908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3719885051250458, + "epoch": 6.24, + "learning_rate": 5.434126313247192e-06, + "loss": 0.3994, + "step": 7381, + "task_loss": 0.9592141509056091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48282116651535034, + "epoch": 6.24, + "learning_rate": 5.428088395121363e-06, + "loss": 0.4522, + "step": 7382, + "task_loss": 0.3770902752876282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3770049512386322, + "epoch": 6.24, + "learning_rate": 5.422050476995532e-06, + "loss": 0.5106, + "step": 7383, + "task_loss": 1.098600149154663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32676392793655396, + "epoch": 6.24, + "learning_rate": 5.4160125588697024e-06, + "loss": 0.4064, + "step": 7384, + "task_loss": 0.08062362670898438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5843770503997803, + "epoch": 6.24, + "learning_rate": 5.409974640743871e-06, + "loss": 0.4086, + "step": 7385, + "task_loss": 0.6854328513145447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44204598665237427, + "epoch": 6.24, + "learning_rate": 5.403936722618042e-06, + "loss": 0.3637, + "step": 7386, + "task_loss": 1.2379357814788818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3880566358566284, + "epoch": 6.24, + "learning_rate": 5.397898804492212e-06, + "loss": 0.4337, + "step": 7387, + "task_loss": 1.2023465633392334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5054335594177246, + "epoch": 6.24, + "learning_rate": 5.391860886366381e-06, + "loss": 0.5176, + "step": 7388, + "task_loss": 0.19818831980228424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3425913453102112, + "epoch": 6.25, + "learning_rate": 5.385822968240551e-06, + "loss": 0.3719, + "step": 7389, + "task_loss": 0.4872191250324249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7701621055603027, + "epoch": 6.25, + "learning_rate": 5.379785050114721e-06, + "loss": 0.495, + "step": 7390, + "task_loss": 0.26044097542762756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40795350074768066, + "epoch": 6.25, + "learning_rate": 5.3737471319888905e-06, + "loss": 0.4239, + "step": 7391, + "task_loss": 0.5950875282287598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5454784631729126, + "epoch": 6.25, + "learning_rate": 5.36770921386306e-06, + "loss": 0.3849, + "step": 7392, + "task_loss": 0.8058868050575256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6760454773902893, + "epoch": 6.25, + "learning_rate": 5.36167129573723e-06, + "loss": 0.548, + "step": 7393, + "task_loss": 1.0544650554656982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22901886701583862, + "epoch": 6.25, + "learning_rate": 5.3556333776114e-06, + "loss": 0.3495, + "step": 7394, + "task_loss": 0.5884392261505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4032093584537506, + "epoch": 6.25, + "learning_rate": 5.349595459485569e-06, + "loss": 0.3375, + "step": 7395, + "task_loss": 0.5870912075042725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31143710017204285, + "epoch": 6.25, + "learning_rate": 5.343557541359739e-06, + "loss": 0.3784, + "step": 7396, + "task_loss": 0.1689995974302292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5785366296768188, + "epoch": 6.25, + "learning_rate": 5.33751962323391e-06, + "loss": 0.3605, + "step": 7397, + "task_loss": 0.9896876811981201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5109443664550781, + "epoch": 6.25, + "learning_rate": 5.3314817051080786e-06, + "loss": 0.4661, + "step": 7398, + "task_loss": 0.04805897921323776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6356993317604065, + "epoch": 6.25, + "learning_rate": 5.325443786982249e-06, + "loss": 0.4749, + "step": 7399, + "task_loss": 1.2756054401397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2623715102672577, + "epoch": 6.26, + "learning_rate": 5.319405868856418e-06, + "loss": 0.3826, + "step": 7400, + "task_loss": 0.6776277422904968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5783810615539551, + "epoch": 6.26, + "learning_rate": 5.313367950730589e-06, + "loss": 0.4556, + "step": 7401, + "task_loss": 0.336563378572464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.356137752532959, + "epoch": 6.26, + "learning_rate": 5.307330032604758e-06, + "loss": 0.3666, + "step": 7402, + "task_loss": 0.3089883029460907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16587617993354797, + "epoch": 6.26, + "learning_rate": 5.301292114478928e-06, + "loss": 0.25, + "step": 7403, + "task_loss": 0.369858056306839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6136224269866943, + "epoch": 6.26, + "learning_rate": 5.295254196353098e-06, + "loss": 0.4745, + "step": 7404, + "task_loss": 0.1862764209508896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.336365669965744, + "epoch": 6.26, + "learning_rate": 5.2892162782272675e-06, + "loss": 0.4337, + "step": 7405, + "task_loss": 0.5411155223846436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4495391249656677, + "epoch": 6.26, + "learning_rate": 5.283178360101437e-06, + "loss": 0.4121, + "step": 7406, + "task_loss": 0.7356066703796387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31159916520118713, + "epoch": 6.26, + "learning_rate": 5.277140441975608e-06, + "loss": 0.3717, + "step": 7407, + "task_loss": 0.44125157594680786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40781551599502563, + "epoch": 6.26, + "learning_rate": 5.271102523849777e-06, + "loss": 0.4688, + "step": 7408, + "task_loss": 0.38573363423347473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.556952714920044, + "epoch": 6.26, + "learning_rate": 5.265064605723946e-06, + "loss": 0.4247, + "step": 7409, + "task_loss": 0.6565746665000916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7109231948852539, + "epoch": 6.26, + "learning_rate": 5.259026687598116e-06, + "loss": 0.6595, + "step": 7410, + "task_loss": 0.9191327691078186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26015159487724304, + "epoch": 6.26, + "learning_rate": 5.252988769472286e-06, + "loss": 0.353, + "step": 7411, + "task_loss": 0.4643987715244293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2671029567718506, + "epoch": 6.27, + "learning_rate": 5.246950851346456e-06, + "loss": 0.3617, + "step": 7412, + "task_loss": 0.6312130093574524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5368250012397766, + "epoch": 6.27, + "learning_rate": 5.240912933220625e-06, + "loss": 0.6017, + "step": 7413, + "task_loss": 1.2658514976501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2623820900917053, + "epoch": 6.27, + "learning_rate": 5.234875015094796e-06, + "loss": 0.3661, + "step": 7414, + "task_loss": 0.4086884558200836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38856595754623413, + "epoch": 6.27, + "learning_rate": 5.228837096968965e-06, + "loss": 0.3919, + "step": 7415, + "task_loss": 0.31083202362060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5060527920722961, + "epoch": 6.27, + "learning_rate": 5.222799178843135e-06, + "loss": 0.6069, + "step": 7416, + "task_loss": 1.4717689752578735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42010295391082764, + "epoch": 6.27, + "learning_rate": 5.216761260717305e-06, + "loss": 0.4907, + "step": 7417, + "task_loss": 0.5572549700737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4489558935165405, + "epoch": 6.27, + "learning_rate": 5.210723342591475e-06, + "loss": 0.38, + "step": 7418, + "task_loss": 0.1304665505886078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30602511763572693, + "epoch": 6.27, + "learning_rate": 5.2046854244656445e-06, + "loss": 0.4126, + "step": 7419, + "task_loss": 1.026025652885437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6032537221908569, + "epoch": 6.27, + "learning_rate": 5.198647506339814e-06, + "loss": 0.5059, + "step": 7420, + "task_loss": 1.0757315158843994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2799069881439209, + "epoch": 6.27, + "learning_rate": 5.192609588213984e-06, + "loss": 0.4706, + "step": 7421, + "task_loss": 0.8361231684684753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18934589624404907, + "epoch": 6.27, + "learning_rate": 5.1865716700881545e-06, + "loss": 0.3557, + "step": 7422, + "task_loss": 0.09834219515323639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35489460825920105, + "epoch": 6.27, + "learning_rate": 5.180533751962323e-06, + "loss": 0.2697, + "step": 7423, + "task_loss": 0.2501721978187561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45717719197273254, + "epoch": 6.28, + "learning_rate": 5.174495833836494e-06, + "loss": 0.3825, + "step": 7424, + "task_loss": 0.4972097873687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43969860672950745, + "epoch": 6.28, + "learning_rate": 5.168457915710663e-06, + "loss": 0.4208, + "step": 7425, + "task_loss": 1.141324520111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2302996814250946, + "epoch": 6.28, + "learning_rate": 5.162419997584833e-06, + "loss": 0.32, + "step": 7426, + "task_loss": 0.3124285340309143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3646153509616852, + "epoch": 6.28, + "learning_rate": 5.156382079459002e-06, + "loss": 0.3812, + "step": 7427, + "task_loss": 0.6510748863220215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21820974349975586, + "epoch": 6.28, + "learning_rate": 5.150344161333173e-06, + "loss": 0.3455, + "step": 7428, + "task_loss": 0.30267438292503357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34554946422576904, + "epoch": 6.28, + "learning_rate": 5.1443062432073426e-06, + "loss": 0.3494, + "step": 7429, + "task_loss": 0.1413135528564453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1820530742406845, + "epoch": 6.28, + "learning_rate": 5.138268325081512e-06, + "loss": 0.3375, + "step": 7430, + "task_loss": 0.3372766375541687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6225518584251404, + "epoch": 6.28, + "learning_rate": 5.132230406955682e-06, + "loss": 0.493, + "step": 7431, + "task_loss": 0.4200439155101776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5159047842025757, + "epoch": 6.28, + "learning_rate": 5.126192488829852e-06, + "loss": 0.4264, + "step": 7432, + "task_loss": 0.4203190505504608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1653871089220047, + "epoch": 6.28, + "learning_rate": 5.1201545707040215e-06, + "loss": 0.3187, + "step": 7433, + "task_loss": 0.36211875081062317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.659524142742157, + "epoch": 6.28, + "learning_rate": 5.114116652578191e-06, + "loss": 0.5061, + "step": 7434, + "task_loss": 0.1781531274318695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2806876599788666, + "epoch": 6.28, + "learning_rate": 5.108078734452361e-06, + "loss": 0.4059, + "step": 7435, + "task_loss": 0.06963673233985901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2963976263999939, + "epoch": 6.29, + "learning_rate": 5.102040816326531e-06, + "loss": 0.3719, + "step": 7436, + "task_loss": 0.021247699856758118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4363195300102234, + "epoch": 6.29, + "learning_rate": 5.0960028982007e-06, + "loss": 0.4445, + "step": 7437, + "task_loss": 0.4809010624885559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8268147706985474, + "epoch": 6.29, + "learning_rate": 5.08996498007487e-06, + "loss": 0.646, + "step": 7438, + "task_loss": 0.3905881941318512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6067918539047241, + "epoch": 6.29, + "learning_rate": 5.083927061949041e-06, + "loss": 0.4475, + "step": 7439, + "task_loss": 0.2572454810142517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4973216652870178, + "epoch": 6.29, + "learning_rate": 5.0778891438232095e-06, + "loss": 0.3856, + "step": 7440, + "task_loss": 1.0486277341842651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35328763723373413, + "epoch": 6.29, + "learning_rate": 5.07185122569738e-06, + "loss": 0.4463, + "step": 7441, + "task_loss": 0.6866239905357361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3136610686779022, + "epoch": 6.29, + "learning_rate": 5.065813307571549e-06, + "loss": 0.3996, + "step": 7442, + "task_loss": 1.1094201803207397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5310568809509277, + "epoch": 6.29, + "learning_rate": 5.0597753894457195e-06, + "loss": 0.4816, + "step": 7443, + "task_loss": 0.9902428388595581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47853708267211914, + "epoch": 6.29, + "learning_rate": 5.053737471319889e-06, + "loss": 0.4335, + "step": 7444, + "task_loss": 0.041457246989011765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3184778094291687, + "epoch": 6.29, + "learning_rate": 5.047699553194059e-06, + "loss": 0.5052, + "step": 7445, + "task_loss": 0.24103352427482605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2613599896430969, + "epoch": 6.29, + "learning_rate": 5.041661635068229e-06, + "loss": 0.2967, + "step": 7446, + "task_loss": 0.11688145995140076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29496461153030396, + "epoch": 6.29, + "learning_rate": 5.0356237169423984e-06, + "loss": 0.4397, + "step": 7447, + "task_loss": 0.34415164589881897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7145918011665344, + "epoch": 6.3, + "learning_rate": 5.029585798816568e-06, + "loss": 0.5131, + "step": 7448, + "task_loss": 0.9403545260429382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5072611570358276, + "epoch": 6.3, + "learning_rate": 5.023547880690739e-06, + "loss": 0.4163, + "step": 7449, + "task_loss": 0.8284904360771179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3969079554080963, + "epoch": 6.3, + "learning_rate": 5.017509962564908e-06, + "loss": 0.4522, + "step": 7450, + "task_loss": 0.8418602347373962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49214908480644226, + "epoch": 6.3, + "learning_rate": 5.011472044439078e-06, + "loss": 0.4181, + "step": 7451, + "task_loss": 0.8271406292915344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45256391167640686, + "epoch": 6.3, + "learning_rate": 5.005434126313247e-06, + "loss": 0.4629, + "step": 7452, + "task_loss": 0.051237862557172775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4098917841911316, + "epoch": 6.3, + "learning_rate": 4.999396208187418e-06, + "loss": 0.4457, + "step": 7453, + "task_loss": 0.36419373750686646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43007057905197144, + "epoch": 6.3, + "learning_rate": 4.993358290061587e-06, + "loss": 0.455, + "step": 7454, + "task_loss": 0.7757449150085449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5634856224060059, + "epoch": 6.3, + "learning_rate": 4.987320371935757e-06, + "loss": 0.4573, + "step": 7455, + "task_loss": 1.137110948562622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32444557547569275, + "epoch": 6.3, + "learning_rate": 4.981282453809927e-06, + "loss": 0.366, + "step": 7456, + "task_loss": 0.2734954059123993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.314064621925354, + "epoch": 6.3, + "learning_rate": 4.9752445356840965e-06, + "loss": 0.3964, + "step": 7457, + "task_loss": 1.006845474243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36465707421302795, + "epoch": 6.3, + "learning_rate": 4.969206617558266e-06, + "loss": 0.4777, + "step": 7458, + "task_loss": 0.6425678133964539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4745516777038574, + "epoch": 6.3, + "learning_rate": 4.963168699432436e-06, + "loss": 0.493, + "step": 7459, + "task_loss": 0.8187721371650696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38694560527801514, + "epoch": 6.31, + "learning_rate": 4.957130781306606e-06, + "loss": 0.3767, + "step": 7460, + "task_loss": 0.48382118344306946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3386395275592804, + "epoch": 6.31, + "learning_rate": 4.9510928631807754e-06, + "loss": 0.3588, + "step": 7461, + "task_loss": 0.7395762801170349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49550771713256836, + "epoch": 6.31, + "learning_rate": 4.945054945054945e-06, + "loss": 0.4234, + "step": 7462, + "task_loss": 0.3755562901496887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20921988785266876, + "epoch": 6.31, + "learning_rate": 4.939017026929115e-06, + "loss": 0.5432, + "step": 7463, + "task_loss": 0.10890568792819977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2143513560295105, + "epoch": 6.31, + "learning_rate": 4.9329791088032854e-06, + "loss": 0.3109, + "step": 7464, + "task_loss": 0.787300705909729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32300275564193726, + "epoch": 6.31, + "learning_rate": 4.926941190677454e-06, + "loss": 0.4159, + "step": 7465, + "task_loss": 0.5513202548027039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.261161744594574, + "epoch": 6.31, + "learning_rate": 4.920903272551625e-06, + "loss": 0.5107, + "step": 7466, + "task_loss": 0.6095837950706482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28015199303627014, + "epoch": 6.31, + "learning_rate": 4.914865354425794e-06, + "loss": 0.3888, + "step": 7467, + "task_loss": 0.5899730324745178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4837396442890167, + "epoch": 6.31, + "learning_rate": 4.908827436299964e-06, + "loss": 0.4278, + "step": 7468, + "task_loss": 0.675345778465271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2367822527885437, + "epoch": 6.31, + "learning_rate": 4.902789518174133e-06, + "loss": 0.4057, + "step": 7469, + "task_loss": 0.14069326221942902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29746514558792114, + "epoch": 6.31, + "learning_rate": 4.896751600048304e-06, + "loss": 0.4894, + "step": 7470, + "task_loss": 0.21853554248809814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42051124572753906, + "epoch": 6.32, + "learning_rate": 4.8907136819224735e-06, + "loss": 0.4628, + "step": 7471, + "task_loss": 0.528052568435669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35436365008354187, + "epoch": 6.32, + "learning_rate": 4.884675763796643e-06, + "loss": 0.3303, + "step": 7472, + "task_loss": 0.4779060184955597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2609892785549164, + "epoch": 6.32, + "learning_rate": 4.878637845670813e-06, + "loss": 0.4379, + "step": 7473, + "task_loss": 0.723876953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6016688346862793, + "epoch": 6.32, + "learning_rate": 4.872599927544983e-06, + "loss": 0.4576, + "step": 7474, + "task_loss": 0.7880899310112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3632658123970032, + "epoch": 6.32, + "learning_rate": 4.866562009419152e-06, + "loss": 0.6327, + "step": 7475, + "task_loss": 0.7446721792221069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7677070498466492, + "epoch": 6.32, + "learning_rate": 4.860524091293322e-06, + "loss": 0.4704, + "step": 7476, + "task_loss": 0.3463546633720398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38483577966690063, + "epoch": 6.32, + "learning_rate": 4.854486173167492e-06, + "loss": 0.3704, + "step": 7477, + "task_loss": 0.17069073021411896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3742620646953583, + "epoch": 6.32, + "learning_rate": 4.848448255041662e-06, + "loss": 0.3999, + "step": 7478, + "task_loss": 0.6305133700370789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6221630573272705, + "epoch": 6.32, + "learning_rate": 4.842410336915831e-06, + "loss": 0.3871, + "step": 7479, + "task_loss": 0.5501518249511719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27786779403686523, + "epoch": 6.32, + "learning_rate": 4.836372418790001e-06, + "loss": 0.3501, + "step": 7480, + "task_loss": 1.025245189666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2813095450401306, + "epoch": 6.32, + "learning_rate": 4.830334500664172e-06, + "loss": 0.3635, + "step": 7481, + "task_loss": 0.374921053647995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2870813310146332, + "epoch": 6.32, + "learning_rate": 4.8242965825383405e-06, + "loss": 0.4124, + "step": 7482, + "task_loss": 0.5259470343589783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4657238721847534, + "epoch": 6.33, + "learning_rate": 4.818258664412511e-06, + "loss": 0.4062, + "step": 7483, + "task_loss": 0.9027307033538818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32433903217315674, + "epoch": 6.33, + "learning_rate": 4.81222074628668e-06, + "loss": 0.3874, + "step": 7484, + "task_loss": 0.7425508499145508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7295410633087158, + "epoch": 6.33, + "learning_rate": 4.8061828281608505e-06, + "loss": 0.4008, + "step": 7485, + "task_loss": 0.6107948422431946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2529045343399048, + "epoch": 6.33, + "learning_rate": 4.80014491003502e-06, + "loss": 0.3821, + "step": 7486, + "task_loss": 0.45456260442733765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45728346705436707, + "epoch": 6.33, + "learning_rate": 4.79410699190919e-06, + "loss": 0.4356, + "step": 7487, + "task_loss": 0.8311693072319031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23208969831466675, + "epoch": 6.33, + "learning_rate": 4.78806907378336e-06, + "loss": 0.3519, + "step": 7488, + "task_loss": 0.12503524124622345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2905499339103699, + "epoch": 6.33, + "learning_rate": 4.782031155657529e-06, + "loss": 0.3704, + "step": 7489, + "task_loss": 0.023636208847165108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47241345047950745, + "epoch": 6.33, + "learning_rate": 4.775993237531699e-06, + "loss": 0.5011, + "step": 7490, + "task_loss": 0.22911174595355988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28566616773605347, + "epoch": 6.33, + "learning_rate": 4.76995531940587e-06, + "loss": 0.479, + "step": 7491, + "task_loss": 0.42829686403274536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31720277667045593, + "epoch": 6.33, + "learning_rate": 4.7639174012800386e-06, + "loss": 0.4517, + "step": 7492, + "task_loss": 0.42388537526130676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3090691566467285, + "epoch": 6.33, + "learning_rate": 4.757879483154209e-06, + "loss": 0.4426, + "step": 7493, + "task_loss": 0.5193140506744385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4583055078983307, + "epoch": 6.33, + "learning_rate": 4.751841565028378e-06, + "loss": 0.3626, + "step": 7494, + "task_loss": 0.5420424342155457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2785313129425049, + "epoch": 6.34, + "learning_rate": 4.745803646902549e-06, + "loss": 0.4094, + "step": 7495, + "task_loss": 0.8193860650062561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41292303800582886, + "epoch": 6.34, + "learning_rate": 4.739765728776718e-06, + "loss": 0.3711, + "step": 7496, + "task_loss": 0.7001169919967651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4504171311855316, + "epoch": 6.34, + "learning_rate": 4.733727810650888e-06, + "loss": 0.4295, + "step": 7497, + "task_loss": 0.5550360083580017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3314458727836609, + "epoch": 6.34, + "learning_rate": 4.727689892525058e-06, + "loss": 0.4695, + "step": 7498, + "task_loss": 0.4536001682281494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5099450349807739, + "epoch": 6.34, + "learning_rate": 4.7216519743992275e-06, + "loss": 0.475, + "step": 7499, + "task_loss": 0.61962890625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48199573159217834, + "epoch": 6.34, + "learning_rate": 4.715614056273397e-06, + "loss": 0.4386, + "step": 7500, + "task_loss": 0.7066032290458679 + }, + { + "epoch": 6.34, + "eval_accuracy": 0.9112079207920792, + "eval_loss": 0.273820161819458, + "eval_runtime": 227.7817, + "eval_samples_per_second": 110.852, + "eval_steps_per_second": 0.869, + "step": 7500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3561236560344696, + "epoch": 6.34, + "learning_rate": 4.709576138147567e-06, + "loss": 0.322, + "step": 7501, + "task_loss": 1.0204592943191528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37394022941589355, + "epoch": 6.34, + "learning_rate": 4.703538220021737e-06, + "loss": 0.4309, + "step": 7502, + "task_loss": 0.9365481734275818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27706557512283325, + "epoch": 6.34, + "learning_rate": 4.697500301895906e-06, + "loss": 0.3313, + "step": 7503, + "task_loss": 0.7739271521568298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16937577724456787, + "epoch": 6.34, + "learning_rate": 4.691462383770076e-06, + "loss": 0.3784, + "step": 7504, + "task_loss": 0.14141735434532166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.532852292060852, + "epoch": 6.34, + "learning_rate": 4.685424465644246e-06, + "loss": 0.439, + "step": 7505, + "task_loss": 0.26980021595954895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5000603795051575, + "epoch": 6.34, + "learning_rate": 4.679386547518416e-06, + "loss": 0.4344, + "step": 7506, + "task_loss": 0.7374952435493469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30799609422683716, + "epoch": 6.35, + "learning_rate": 4.673348629392585e-06, + "loss": 0.4124, + "step": 7507, + "task_loss": 0.09331848472356796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27996543049812317, + "epoch": 6.35, + "learning_rate": 4.667310711266756e-06, + "loss": 0.3709, + "step": 7508, + "task_loss": 0.9448261857032776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20915837585926056, + "epoch": 6.35, + "learning_rate": 4.661272793140925e-06, + "loss": 0.3654, + "step": 7509, + "task_loss": 0.1709122359752655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49053695797920227, + "epoch": 6.35, + "learning_rate": 4.655234875015095e-06, + "loss": 0.4347, + "step": 7510, + "task_loss": 0.6704981327056885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32792824506759644, + "epoch": 6.35, + "learning_rate": 4.649196956889264e-06, + "loss": 0.4019, + "step": 7511, + "task_loss": 0.8451147079467773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37072181701660156, + "epoch": 6.35, + "learning_rate": 4.643159038763435e-06, + "loss": 0.3631, + "step": 7512, + "task_loss": 0.15844447910785675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25223928689956665, + "epoch": 6.35, + "learning_rate": 4.6371211206376045e-06, + "loss": 0.3173, + "step": 7513, + "task_loss": 0.38770201802253723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47233253717422485, + "epoch": 6.35, + "learning_rate": 4.631083202511774e-06, + "loss": 0.4189, + "step": 7514, + "task_loss": 0.37192830443382263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2528648376464844, + "epoch": 6.35, + "learning_rate": 4.625045284385944e-06, + "loss": 0.3386, + "step": 7515, + "task_loss": 0.5089412927627563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.335831880569458, + "epoch": 6.35, + "learning_rate": 4.619007366260114e-06, + "loss": 0.3753, + "step": 7516, + "task_loss": 0.6398341655731201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33378735184669495, + "epoch": 6.35, + "learning_rate": 4.612969448134283e-06, + "loss": 0.3604, + "step": 7517, + "task_loss": 0.20500139892101288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3438367247581482, + "epoch": 6.35, + "learning_rate": 4.606931530008454e-06, + "loss": 0.4045, + "step": 7518, + "task_loss": 0.36127787828445435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37273192405700684, + "epoch": 6.36, + "learning_rate": 4.600893611882623e-06, + "loss": 0.4005, + "step": 7519, + "task_loss": 0.26467981934547424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48539233207702637, + "epoch": 6.36, + "learning_rate": 4.594855693756793e-06, + "loss": 0.6214, + "step": 7520, + "task_loss": 0.585095226764679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37473398447036743, + "epoch": 6.36, + "learning_rate": 4.588817775630962e-06, + "loss": 0.4132, + "step": 7521, + "task_loss": 0.7003822326660156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4969085156917572, + "epoch": 6.36, + "learning_rate": 4.582779857505133e-06, + "loss": 0.5346, + "step": 7522, + "task_loss": 1.1036744117736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19755706191062927, + "epoch": 6.36, + "learning_rate": 4.5767419393793026e-06, + "loss": 0.3701, + "step": 7523, + "task_loss": 0.04986334592103958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39888885617256165, + "epoch": 6.36, + "learning_rate": 4.570704021253472e-06, + "loss": 0.4021, + "step": 7524, + "task_loss": 0.4452453553676605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3544149100780487, + "epoch": 6.36, + "learning_rate": 4.564666103127642e-06, + "loss": 0.5224, + "step": 7525, + "task_loss": 0.6101288199424744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3820301592350006, + "epoch": 6.36, + "learning_rate": 4.558628185001811e-06, + "loss": 0.4929, + "step": 7526, + "task_loss": 1.1957510709762573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30141013860702515, + "epoch": 6.36, + "learning_rate": 4.5525902668759815e-06, + "loss": 0.3253, + "step": 7527, + "task_loss": 0.08777723461389542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3219088613986969, + "epoch": 6.36, + "learning_rate": 4.546552348750151e-06, + "loss": 0.4185, + "step": 7528, + "task_loss": 0.29596471786499023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32884734869003296, + "epoch": 6.36, + "learning_rate": 4.540514430624321e-06, + "loss": 0.425, + "step": 7529, + "task_loss": 0.9923633933067322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49735143780708313, + "epoch": 6.36, + "learning_rate": 4.534476512498491e-06, + "loss": 0.5002, + "step": 7530, + "task_loss": 0.2476939857006073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1843915730714798, + "epoch": 6.37, + "learning_rate": 4.52843859437266e-06, + "loss": 0.3579, + "step": 7531, + "task_loss": 0.24017155170440674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2773257791996002, + "epoch": 6.37, + "learning_rate": 4.52240067624683e-06, + "loss": 0.4201, + "step": 7532, + "task_loss": 0.8239900469779968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5478127598762512, + "epoch": 6.37, + "learning_rate": 4.516362758121001e-06, + "loss": 0.5481, + "step": 7533, + "task_loss": 0.3263891339302063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33114004135131836, + "epoch": 6.37, + "learning_rate": 4.5103248399951695e-06, + "loss": 0.385, + "step": 7534, + "task_loss": 0.41055619716644287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4305839240550995, + "epoch": 6.37, + "learning_rate": 4.50428692186934e-06, + "loss": 0.3887, + "step": 7535, + "task_loss": 0.6155098080635071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3318480849266052, + "epoch": 6.37, + "learning_rate": 4.498249003743509e-06, + "loss": 0.4037, + "step": 7536, + "task_loss": 0.1411130428314209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22847118973731995, + "epoch": 6.37, + "learning_rate": 4.4922110856176795e-06, + "loss": 0.3806, + "step": 7537, + "task_loss": 0.18115925788879395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4208347499370575, + "epoch": 6.37, + "learning_rate": 4.486173167491849e-06, + "loss": 0.474, + "step": 7538, + "task_loss": 0.9093618392944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27996793389320374, + "epoch": 6.37, + "learning_rate": 4.480135249366019e-06, + "loss": 0.5177, + "step": 7539, + "task_loss": 0.9155387878417969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3441300392150879, + "epoch": 6.37, + "learning_rate": 4.474097331240189e-06, + "loss": 0.3253, + "step": 7540, + "task_loss": 0.32349810004234314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24208030104637146, + "epoch": 6.37, + "learning_rate": 4.4680594131143584e-06, + "loss": 0.4382, + "step": 7541, + "task_loss": 0.264790803194046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2643689215183258, + "epoch": 6.38, + "learning_rate": 4.462021494988528e-06, + "loss": 0.3755, + "step": 7542, + "task_loss": 0.27015185356140137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.368619441986084, + "epoch": 6.38, + "learning_rate": 4.455983576862698e-06, + "loss": 0.3966, + "step": 7543, + "task_loss": 0.6519206762313843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3943847417831421, + "epoch": 6.38, + "learning_rate": 4.449945658736868e-06, + "loss": 0.4394, + "step": 7544, + "task_loss": 1.2400013208389282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28756478428840637, + "epoch": 6.38, + "learning_rate": 4.443907740611037e-06, + "loss": 0.3717, + "step": 7545, + "task_loss": 0.7410237193107605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21646368503570557, + "epoch": 6.38, + "learning_rate": 4.437869822485207e-06, + "loss": 0.3485, + "step": 7546, + "task_loss": 0.08190867304801941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35251379013061523, + "epoch": 6.38, + "learning_rate": 4.431831904359377e-06, + "loss": 0.4489, + "step": 7547, + "task_loss": 1.0965979099273682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49258944392204285, + "epoch": 6.38, + "learning_rate": 4.425793986233547e-06, + "loss": 0.4465, + "step": 7548, + "task_loss": 0.7132380604743958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6008623838424683, + "epoch": 6.38, + "learning_rate": 4.419756068107716e-06, + "loss": 0.4425, + "step": 7549, + "task_loss": 0.3820526599884033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.13495850563049316, + "epoch": 6.38, + "learning_rate": 4.413718149981887e-06, + "loss": 0.2814, + "step": 7550, + "task_loss": 0.18869565427303314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4906533360481262, + "epoch": 6.38, + "learning_rate": 4.407680231856056e-06, + "loss": 0.4471, + "step": 7551, + "task_loss": 1.428904414176941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48705172538757324, + "epoch": 6.38, + "learning_rate": 4.401642313730226e-06, + "loss": 0.3429, + "step": 7552, + "task_loss": 0.20902010798454285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.392392635345459, + "epoch": 6.38, + "learning_rate": 4.395604395604396e-06, + "loss": 0.2906, + "step": 7553, + "task_loss": 1.0537656545639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2439027726650238, + "epoch": 6.39, + "learning_rate": 4.389566477478566e-06, + "loss": 0.4182, + "step": 7554, + "task_loss": 1.2260949611663818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5100665092468262, + "epoch": 6.39, + "learning_rate": 4.383528559352735e-06, + "loss": 0.4576, + "step": 7555, + "task_loss": 0.9553603529930115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3274306654930115, + "epoch": 6.39, + "learning_rate": 4.377490641226905e-06, + "loss": 0.4924, + "step": 7556, + "task_loss": 0.18308515846729279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4941861629486084, + "epoch": 6.39, + "learning_rate": 4.371452723101075e-06, + "loss": 0.519, + "step": 7557, + "task_loss": 0.44950351119041443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3427702486515045, + "epoch": 6.39, + "learning_rate": 4.365414804975245e-06, + "loss": 0.4269, + "step": 7558, + "task_loss": 0.6943057775497437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2340335249900818, + "epoch": 6.39, + "learning_rate": 4.359376886849414e-06, + "loss": 0.3883, + "step": 7559, + "task_loss": 0.8555739521980286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6850919723510742, + "epoch": 6.39, + "learning_rate": 4.353338968723585e-06, + "loss": 0.4789, + "step": 7560, + "task_loss": 0.5528737306594849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3963630497455597, + "epoch": 6.39, + "learning_rate": 4.347301050597754e-06, + "loss": 0.4623, + "step": 7561, + "task_loss": 1.5189728736877441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6989535093307495, + "epoch": 6.39, + "learning_rate": 4.341263132471924e-06, + "loss": 0.3597, + "step": 7562, + "task_loss": 0.6105638742446899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26403138041496277, + "epoch": 6.39, + "learning_rate": 4.335225214346093e-06, + "loss": 0.4618, + "step": 7563, + "task_loss": 0.15609999001026154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5988813638687134, + "epoch": 6.39, + "learning_rate": 4.329187296220264e-06, + "loss": 0.5615, + "step": 7564, + "task_loss": 0.2901882827281952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47591882944107056, + "epoch": 6.39, + "learning_rate": 4.3231493780944335e-06, + "loss": 0.4794, + "step": 7565, + "task_loss": 0.6742881536483765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5205352306365967, + "epoch": 6.4, + "learning_rate": 4.317111459968603e-06, + "loss": 0.4945, + "step": 7566, + "task_loss": 0.741621196269989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38645514845848083, + "epoch": 6.4, + "learning_rate": 4.311073541842773e-06, + "loss": 0.3531, + "step": 7567, + "task_loss": 0.6004335880279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5187391042709351, + "epoch": 6.4, + "learning_rate": 4.305035623716943e-06, + "loss": 0.4456, + "step": 7568, + "task_loss": 0.7985501885414124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46640944480895996, + "epoch": 6.4, + "learning_rate": 4.298997705591112e-06, + "loss": 0.5245, + "step": 7569, + "task_loss": 0.19216766953468323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35804954171180725, + "epoch": 6.4, + "learning_rate": 4.292959787465282e-06, + "loss": 0.4729, + "step": 7570, + "task_loss": 0.8343668580055237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3087470531463623, + "epoch": 6.4, + "learning_rate": 4.286921869339452e-06, + "loss": 0.4095, + "step": 7571, + "task_loss": 0.1514779031276703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.557770311832428, + "epoch": 6.4, + "learning_rate": 4.280883951213622e-06, + "loss": 0.482, + "step": 7572, + "task_loss": 1.0636582374572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35276347398757935, + "epoch": 6.4, + "learning_rate": 4.274846033087791e-06, + "loss": 0.4895, + "step": 7573, + "task_loss": 0.872612714767456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32482096552848816, + "epoch": 6.4, + "learning_rate": 4.268808114961961e-06, + "loss": 0.438, + "step": 7574, + "task_loss": 1.0699458122253418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5266396999359131, + "epoch": 6.4, + "learning_rate": 4.262770196836132e-06, + "loss": 0.5185, + "step": 7575, + "task_loss": 1.4329333305358887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5066378116607666, + "epoch": 6.4, + "learning_rate": 4.2567322787103005e-06, + "loss": 0.4627, + "step": 7576, + "task_loss": 0.16584105789661407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3045175075531006, + "epoch": 6.4, + "learning_rate": 4.250694360584471e-06, + "loss": 0.3664, + "step": 7577, + "task_loss": 0.1249660849571228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17197775840759277, + "epoch": 6.41, + "learning_rate": 4.24465644245864e-06, + "loss": 0.3831, + "step": 7578, + "task_loss": 0.28768932819366455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2579072117805481, + "epoch": 6.41, + "learning_rate": 4.2386185243328105e-06, + "loss": 0.3706, + "step": 7579, + "task_loss": 0.6752832531929016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6407304406166077, + "epoch": 6.41, + "learning_rate": 4.23258060620698e-06, + "loss": 0.4383, + "step": 7580, + "task_loss": 0.33479073643684387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6379284858703613, + "epoch": 6.41, + "learning_rate": 4.22654268808115e-06, + "loss": 0.4051, + "step": 7581, + "task_loss": 1.0305145978927612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39257991313934326, + "epoch": 6.41, + "learning_rate": 4.22050476995532e-06, + "loss": 0.3742, + "step": 7582, + "task_loss": 0.3103486895561218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36527305841445923, + "epoch": 6.41, + "learning_rate": 4.214466851829489e-06, + "loss": 0.4225, + "step": 7583, + "task_loss": 0.759556770324707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2443835288286209, + "epoch": 6.41, + "learning_rate": 4.208428933703659e-06, + "loss": 0.3774, + "step": 7584, + "task_loss": 0.44712769985198975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33049914240837097, + "epoch": 6.41, + "learning_rate": 4.20239101557783e-06, + "loss": 0.35, + "step": 7585, + "task_loss": 0.5728400349617004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9545589685440063, + "epoch": 6.41, + "learning_rate": 4.1963530974519986e-06, + "loss": 0.5458, + "step": 7586, + "task_loss": 1.572511911392212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5978804230690002, + "epoch": 6.41, + "learning_rate": 4.190315179326169e-06, + "loss": 0.524, + "step": 7587, + "task_loss": 1.3174279928207397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5475623607635498, + "epoch": 6.41, + "learning_rate": 4.184277261200338e-06, + "loss": 0.4915, + "step": 7588, + "task_loss": 0.8397957682609558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6554060578346252, + "epoch": 6.41, + "learning_rate": 4.178239343074508e-06, + "loss": 0.5275, + "step": 7589, + "task_loss": 1.1806100606918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4321490228176117, + "epoch": 6.42, + "learning_rate": 4.172201424948678e-06, + "loss": 0.3494, + "step": 7590, + "task_loss": 0.1076197698712349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37346410751342773, + "epoch": 6.42, + "learning_rate": 4.166163506822847e-06, + "loss": 0.3886, + "step": 7591, + "task_loss": 0.7234314680099487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2808346152305603, + "epoch": 6.42, + "learning_rate": 4.160125588697018e-06, + "loss": 0.4969, + "step": 7592, + "task_loss": 0.30329838395118713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4248100519180298, + "epoch": 6.42, + "learning_rate": 4.154087670571187e-06, + "loss": 0.4096, + "step": 7593, + "task_loss": 1.4562383890151978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5851520895957947, + "epoch": 6.42, + "learning_rate": 4.148049752445357e-06, + "loss": 0.4263, + "step": 7594, + "task_loss": 0.4535808563232422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3406076431274414, + "epoch": 6.42, + "learning_rate": 4.142011834319527e-06, + "loss": 0.3837, + "step": 7595, + "task_loss": 0.698113203048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40253448486328125, + "epoch": 6.42, + "learning_rate": 4.135973916193697e-06, + "loss": 0.4029, + "step": 7596, + "task_loss": 0.19418777525424957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3850042223930359, + "epoch": 6.42, + "learning_rate": 4.129935998067866e-06, + "loss": 0.4169, + "step": 7597, + "task_loss": 0.2745683193206787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.566947340965271, + "epoch": 6.42, + "learning_rate": 4.123898079942036e-06, + "loss": 0.4363, + "step": 7598, + "task_loss": 0.16471554338932037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2935982346534729, + "epoch": 6.42, + "learning_rate": 4.117860161816206e-06, + "loss": 0.3777, + "step": 7599, + "task_loss": 1.0851373672485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6052749752998352, + "epoch": 6.42, + "learning_rate": 4.1118222436903755e-06, + "loss": 0.4096, + "step": 7600, + "task_loss": 0.6631654500961304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36230379343032837, + "epoch": 6.42, + "learning_rate": 4.105784325564545e-06, + "loss": 0.2496, + "step": 7601, + "task_loss": 1.0661555528640747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.514445424079895, + "epoch": 6.43, + "learning_rate": 4.099746407438716e-06, + "loss": 0.3579, + "step": 7602, + "task_loss": 0.27308452129364014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2838478684425354, + "epoch": 6.43, + "learning_rate": 4.093708489312885e-06, + "loss": 0.454, + "step": 7603, + "task_loss": 0.3346703350543976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21703946590423584, + "epoch": 6.43, + "learning_rate": 4.087670571187055e-06, + "loss": 0.3625, + "step": 7604, + "task_loss": 0.6335638165473938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31049904227256775, + "epoch": 6.43, + "learning_rate": 4.081632653061224e-06, + "loss": 0.3488, + "step": 7605, + "task_loss": 0.18147899210453033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16660843789577484, + "epoch": 6.43, + "learning_rate": 4.075594734935395e-06, + "loss": 0.3058, + "step": 7606, + "task_loss": 0.555975615978241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36628010869026184, + "epoch": 6.43, + "learning_rate": 4.0695568168095645e-06, + "loss": 0.3826, + "step": 7607, + "task_loss": 0.4056656062602997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36825108528137207, + "epoch": 6.43, + "learning_rate": 4.063518898683734e-06, + "loss": 0.429, + "step": 7608, + "task_loss": 1.3975794315338135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.254421591758728, + "epoch": 6.43, + "learning_rate": 4.057480980557904e-06, + "loss": 0.454, + "step": 7609, + "task_loss": 1.0104660987854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6730448603630066, + "epoch": 6.43, + "learning_rate": 4.051443062432074e-06, + "loss": 0.377, + "step": 7610, + "task_loss": 0.6257966160774231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46320852637290955, + "epoch": 6.43, + "learning_rate": 4.045405144306243e-06, + "loss": 0.3921, + "step": 7611, + "task_loss": 0.8142383694648743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.264749139547348, + "epoch": 6.43, + "learning_rate": 4.039367226180413e-06, + "loss": 0.3795, + "step": 7612, + "task_loss": 0.5733726620674133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4003748595714569, + "epoch": 6.44, + "learning_rate": 4.033329308054583e-06, + "loss": 0.4301, + "step": 7613, + "task_loss": 0.23237332701683044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4649609923362732, + "epoch": 6.44, + "learning_rate": 4.0272913899287525e-06, + "loss": 0.4972, + "step": 7614, + "task_loss": 1.7550774812698364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6407753229141235, + "epoch": 6.44, + "learning_rate": 4.021253471802922e-06, + "loss": 0.58, + "step": 7615, + "task_loss": 0.4978964030742645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30274927616119385, + "epoch": 6.44, + "learning_rate": 4.015215553677092e-06, + "loss": 0.5138, + "step": 7616, + "task_loss": 1.0885311365127563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6557941436767578, + "epoch": 6.44, + "learning_rate": 4.0091776355512625e-06, + "loss": 0.4202, + "step": 7617, + "task_loss": 0.3223996162414551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5106455087661743, + "epoch": 6.44, + "learning_rate": 4.0031397174254314e-06, + "loss": 0.4253, + "step": 7618, + "task_loss": 0.7875860333442688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39976930618286133, + "epoch": 6.44, + "learning_rate": 3.997101799299602e-06, + "loss": 0.4628, + "step": 7619, + "task_loss": 1.2734344005584717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3517210781574249, + "epoch": 6.44, + "learning_rate": 3.991063881173771e-06, + "loss": 0.3506, + "step": 7620, + "task_loss": 0.6845234632492065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2996918559074402, + "epoch": 6.44, + "learning_rate": 3.9850259630479414e-06, + "loss": 0.421, + "step": 7621, + "task_loss": 0.09655660390853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4344547688961029, + "epoch": 6.44, + "learning_rate": 3.978988044922111e-06, + "loss": 0.4774, + "step": 7622, + "task_loss": 0.5702031850814819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44666552543640137, + "epoch": 6.44, + "learning_rate": 3.972950126796281e-06, + "loss": 0.4365, + "step": 7623, + "task_loss": 0.7123099565505981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5679116249084473, + "epoch": 6.44, + "learning_rate": 3.966912208670451e-06, + "loss": 0.3733, + "step": 7624, + "task_loss": 0.661853015422821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34131571650505066, + "epoch": 6.45, + "learning_rate": 3.96087429054462e-06, + "loss": 0.4266, + "step": 7625, + "task_loss": 0.3214270770549774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4605754613876343, + "epoch": 6.45, + "learning_rate": 3.95483637241879e-06, + "loss": 0.4444, + "step": 7626, + "task_loss": 0.5668392181396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5282625555992126, + "epoch": 6.45, + "learning_rate": 3.948798454292961e-06, + "loss": 0.5315, + "step": 7627, + "task_loss": 0.7395278215408325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3747004270553589, + "epoch": 6.45, + "learning_rate": 3.9427605361671295e-06, + "loss": 0.4781, + "step": 7628, + "task_loss": 0.5778380036354065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27020007371902466, + "epoch": 6.45, + "learning_rate": 3.9367226180413e-06, + "loss": 0.3376, + "step": 7629, + "task_loss": 0.521760106086731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5351536870002747, + "epoch": 6.45, + "learning_rate": 3.930684699915469e-06, + "loss": 0.5025, + "step": 7630, + "task_loss": 0.667941689491272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4667362570762634, + "epoch": 6.45, + "learning_rate": 3.9246467817896395e-06, + "loss": 0.5077, + "step": 7631, + "task_loss": 0.061607230454683304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4018675684928894, + "epoch": 6.45, + "learning_rate": 3.918608863663809e-06, + "loss": 0.4813, + "step": 7632, + "task_loss": 1.7519304752349854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5138986110687256, + "epoch": 6.45, + "learning_rate": 3.912570945537979e-06, + "loss": 0.4764, + "step": 7633, + "task_loss": 0.4051055610179901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2937704920768738, + "epoch": 6.45, + "learning_rate": 3.906533027412149e-06, + "loss": 0.3064, + "step": 7634, + "task_loss": 0.8094771504402161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.385277658700943, + "epoch": 6.45, + "learning_rate": 3.9004951092863184e-06, + "loss": 0.4183, + "step": 7635, + "task_loss": 0.8220266103744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3171766400337219, + "epoch": 6.45, + "learning_rate": 3.894457191160488e-06, + "loss": 0.3405, + "step": 7636, + "task_loss": 0.21889159083366394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17481842637062073, + "epoch": 6.46, + "learning_rate": 3.888419273034658e-06, + "loss": 0.5309, + "step": 7637, + "task_loss": 0.6263213157653809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3915245831012726, + "epoch": 6.46, + "learning_rate": 3.882381354908828e-06, + "loss": 0.418, + "step": 7638, + "task_loss": 1.416357398033142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5309879183769226, + "epoch": 6.46, + "learning_rate": 3.876343436782997e-06, + "loss": 0.7501, + "step": 7639, + "task_loss": 0.47290876507759094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27492040395736694, + "epoch": 6.46, + "learning_rate": 3.870305518657167e-06, + "loss": 0.3447, + "step": 7640, + "task_loss": 0.3564997911453247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23887483775615692, + "epoch": 6.46, + "learning_rate": 3.864267600531337e-06, + "loss": 0.3217, + "step": 7641, + "task_loss": 0.46908557415008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47848498821258545, + "epoch": 6.46, + "learning_rate": 3.858229682405507e-06, + "loss": 0.438, + "step": 7642, + "task_loss": 0.3249066472053528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.651793360710144, + "epoch": 6.46, + "learning_rate": 3.852191764279676e-06, + "loss": 0.4527, + "step": 7643, + "task_loss": 1.4709583520889282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38558194041252136, + "epoch": 6.46, + "learning_rate": 3.846153846153847e-06, + "loss": 0.3581, + "step": 7644, + "task_loss": 0.15202544629573822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2086794078350067, + "epoch": 6.46, + "learning_rate": 3.840115928028016e-06, + "loss": 0.3344, + "step": 7645, + "task_loss": 0.46285197138786316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34382709860801697, + "epoch": 6.46, + "learning_rate": 3.834078009902186e-06, + "loss": 0.4527, + "step": 7646, + "task_loss": 0.3009686768054962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39628106355667114, + "epoch": 6.46, + "learning_rate": 3.828040091776355e-06, + "loss": 0.2783, + "step": 7647, + "task_loss": 0.2661716639995575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42240530252456665, + "epoch": 6.46, + "learning_rate": 3.822002173650526e-06, + "loss": 0.3717, + "step": 7648, + "task_loss": 1.3972136974334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2487010955810547, + "epoch": 6.47, + "learning_rate": 3.815964255524695e-06, + "loss": 0.2774, + "step": 7649, + "task_loss": 0.24234063923358917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5285531878471375, + "epoch": 6.47, + "learning_rate": 3.8099263373988647e-06, + "loss": 0.4504, + "step": 7650, + "task_loss": 0.22108548879623413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25214970111846924, + "epoch": 6.47, + "learning_rate": 3.803888419273035e-06, + "loss": 0.4017, + "step": 7651, + "task_loss": 0.8750590085983276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34087246656417847, + "epoch": 6.47, + "learning_rate": 3.797850501147204e-06, + "loss": 0.4213, + "step": 7652, + "task_loss": 0.6880748867988586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4011131823062897, + "epoch": 6.47, + "learning_rate": 3.7918125830213743e-06, + "loss": 0.4774, + "step": 7653, + "task_loss": 0.24854843318462372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3821079730987549, + "epoch": 6.47, + "learning_rate": 3.7857746648955445e-06, + "loss": 0.4651, + "step": 7654, + "task_loss": 0.44583743810653687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27302879095077515, + "epoch": 6.47, + "learning_rate": 3.7797367467697138e-06, + "loss": 0.294, + "step": 7655, + "task_loss": 0.29399457573890686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5970277786254883, + "epoch": 6.47, + "learning_rate": 3.773698828643884e-06, + "loss": 0.3895, + "step": 7656, + "task_loss": 0.6561475396156311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34838950634002686, + "epoch": 6.47, + "learning_rate": 3.767660910518053e-06, + "loss": 0.44, + "step": 7657, + "task_loss": 0.1460404098033905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4720696210861206, + "epoch": 6.47, + "learning_rate": 3.7616229923922234e-06, + "loss": 0.455, + "step": 7658, + "task_loss": 1.0234571695327759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40535807609558105, + "epoch": 6.47, + "learning_rate": 3.7555850742663935e-06, + "loss": 0.3907, + "step": 7659, + "task_loss": 0.1924181878566742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46643105149269104, + "epoch": 6.47, + "learning_rate": 3.749547156140563e-06, + "loss": 0.3699, + "step": 7660, + "task_loss": 0.3735228180885315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21229948103427887, + "epoch": 6.48, + "learning_rate": 3.743509238014733e-06, + "loss": 0.358, + "step": 7661, + "task_loss": 0.3932367265224457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2838762402534485, + "epoch": 6.48, + "learning_rate": 3.7374713198889023e-06, + "loss": 0.4374, + "step": 7662, + "task_loss": 0.6160967350006104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4643041491508484, + "epoch": 6.48, + "learning_rate": 3.7314334017630724e-06, + "loss": 0.5235, + "step": 7663, + "task_loss": 0.37698498368263245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3250167965888977, + "epoch": 6.48, + "learning_rate": 3.725395483637242e-06, + "loss": 0.5315, + "step": 7664, + "task_loss": 0.5543719530105591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4892522990703583, + "epoch": 6.48, + "learning_rate": 3.719357565511412e-06, + "loss": 0.4295, + "step": 7665, + "task_loss": 1.6850335597991943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5706751346588135, + "epoch": 6.48, + "learning_rate": 3.7133196473855816e-06, + "loss": 0.5473, + "step": 7666, + "task_loss": 0.9194045066833496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28616926074028015, + "epoch": 6.48, + "learning_rate": 3.7072817292597513e-06, + "loss": 0.4069, + "step": 7667, + "task_loss": 0.625519871711731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39594894647598267, + "epoch": 6.48, + "learning_rate": 3.701243811133921e-06, + "loss": 0.4118, + "step": 7668, + "task_loss": 0.5988566279411316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26500722765922546, + "epoch": 6.48, + "learning_rate": 3.695205893008091e-06, + "loss": 0.3213, + "step": 7669, + "task_loss": 0.35659363865852356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3120731711387634, + "epoch": 6.48, + "learning_rate": 3.6891679748822605e-06, + "loss": 0.3607, + "step": 7670, + "task_loss": 1.3844609260559082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35540738701820374, + "epoch": 6.48, + "learning_rate": 3.6831300567564306e-06, + "loss": 0.4473, + "step": 7671, + "task_loss": 0.831452488899231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44631096720695496, + "epoch": 6.48, + "learning_rate": 3.6770921386306e-06, + "loss": 0.4504, + "step": 7672, + "task_loss": 0.3051972985267639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24287714064121246, + "epoch": 6.49, + "learning_rate": 3.67105422050477e-06, + "loss": 0.3644, + "step": 7673, + "task_loss": 0.20481258630752563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3949527144432068, + "epoch": 6.49, + "learning_rate": 3.66501630237894e-06, + "loss": 0.3633, + "step": 7674, + "task_loss": 0.4955171048641205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4386295676231384, + "epoch": 6.49, + "learning_rate": 3.6589783842531095e-06, + "loss": 0.3712, + "step": 7675, + "task_loss": 0.21991212666034698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3999235928058624, + "epoch": 6.49, + "learning_rate": 3.6529404661272797e-06, + "loss": 0.3757, + "step": 7676, + "task_loss": 0.41477474570274353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3232361674308777, + "epoch": 6.49, + "learning_rate": 3.646902548001449e-06, + "loss": 0.4137, + "step": 7677, + "task_loss": 0.0927613154053688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35872411727905273, + "epoch": 6.49, + "learning_rate": 3.640864629875619e-06, + "loss": 0.4782, + "step": 7678, + "task_loss": 0.6009278893470764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3037814497947693, + "epoch": 6.49, + "learning_rate": 3.6348267117497893e-06, + "loss": 0.4365, + "step": 7679, + "task_loss": 0.42444831132888794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3644055128097534, + "epoch": 6.49, + "learning_rate": 3.6287887936239586e-06, + "loss": 0.5637, + "step": 7680, + "task_loss": 0.8057777285575867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22185848653316498, + "epoch": 6.49, + "learning_rate": 3.6227508754981287e-06, + "loss": 0.307, + "step": 7681, + "task_loss": 0.08156996965408325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48345449566841125, + "epoch": 6.49, + "learning_rate": 3.616712957372298e-06, + "loss": 0.4428, + "step": 7682, + "task_loss": 0.08626305311918259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2031942903995514, + "epoch": 6.49, + "learning_rate": 3.610675039246468e-06, + "loss": 0.4034, + "step": 7683, + "task_loss": 0.012371708638966084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2889648675918579, + "epoch": 6.5, + "learning_rate": 3.6046371211206383e-06, + "loss": 0.4895, + "step": 7684, + "task_loss": 0.6243511438369751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30407506227493286, + "epoch": 6.5, + "learning_rate": 3.5985992029948076e-06, + "loss": 0.4189, + "step": 7685, + "task_loss": 0.8658360838890076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23634982109069824, + "epoch": 6.5, + "learning_rate": 3.5925612848689777e-06, + "loss": 0.3804, + "step": 7686, + "task_loss": 0.33452314138412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3774471580982208, + "epoch": 6.5, + "learning_rate": 3.586523366743147e-06, + "loss": 0.3363, + "step": 7687, + "task_loss": 0.8340803384780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4032588303089142, + "epoch": 6.5, + "learning_rate": 3.580485448617317e-06, + "loss": 0.3953, + "step": 7688, + "task_loss": 0.8952805995941162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2760763168334961, + "epoch": 6.5, + "learning_rate": 3.5744475304914865e-06, + "loss": 0.387, + "step": 7689, + "task_loss": 0.5491968393325806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4760347306728363, + "epoch": 6.5, + "learning_rate": 3.5684096123656566e-06, + "loss": 0.4099, + "step": 7690, + "task_loss": 1.4676611423492432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4849081337451935, + "epoch": 6.5, + "learning_rate": 3.5623716942398264e-06, + "loss": 0.4279, + "step": 7691, + "task_loss": 1.237672209739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18625405430793762, + "epoch": 6.5, + "learning_rate": 3.5563337761139957e-06, + "loss": 0.4516, + "step": 7692, + "task_loss": 0.06318674981594086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5162789821624756, + "epoch": 6.5, + "learning_rate": 3.550295857988166e-06, + "loss": 0.4514, + "step": 7693, + "task_loss": 0.5883114337921143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23461666703224182, + "epoch": 6.5, + "learning_rate": 3.544257939862335e-06, + "loss": 0.2843, + "step": 7694, + "task_loss": 0.3116132318973541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29992425441741943, + "epoch": 6.5, + "learning_rate": 3.5382200217365053e-06, + "loss": 0.3473, + "step": 7695, + "task_loss": 0.31048455834388733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5551434755325317, + "epoch": 6.51, + "learning_rate": 3.5321821036106754e-06, + "loss": 0.443, + "step": 7696, + "task_loss": 1.0268428325653076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4671921730041504, + "epoch": 6.51, + "learning_rate": 3.5261441854848447e-06, + "loss": 0.4429, + "step": 7697, + "task_loss": 1.5239801406860352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7202664017677307, + "epoch": 6.51, + "learning_rate": 3.520106267359015e-06, + "loss": 0.4115, + "step": 7698, + "task_loss": 0.8783277869224548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2792589068412781, + "epoch": 6.51, + "learning_rate": 3.514068349233184e-06, + "loss": 0.4367, + "step": 7699, + "task_loss": 0.2987062633037567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4970380663871765, + "epoch": 6.51, + "learning_rate": 3.5080304311073543e-06, + "loss": 0.3929, + "step": 7700, + "task_loss": 0.6271637678146362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5262926816940308, + "epoch": 6.51, + "learning_rate": 3.5019925129815245e-06, + "loss": 0.5123, + "step": 7701, + "task_loss": 0.20182503759860992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24873396754264832, + "epoch": 6.51, + "learning_rate": 3.4959545948556938e-06, + "loss": 0.3899, + "step": 7702, + "task_loss": 0.3961564600467682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39284196496009827, + "epoch": 6.51, + "learning_rate": 3.489916676729864e-06, + "loss": 0.3842, + "step": 7703, + "task_loss": 0.5264989733695984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2523111402988434, + "epoch": 6.51, + "learning_rate": 3.483878758604033e-06, + "loss": 0.427, + "step": 7704, + "task_loss": 0.5021206736564636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6154608726501465, + "epoch": 6.51, + "learning_rate": 3.4778408404782034e-06, + "loss": 0.4715, + "step": 7705, + "task_loss": 0.3391534984111786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6082305908203125, + "epoch": 6.51, + "learning_rate": 3.4718029223523735e-06, + "loss": 0.3934, + "step": 7706, + "task_loss": 0.3663659989833832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32460853457450867, + "epoch": 6.51, + "learning_rate": 3.465765004226543e-06, + "loss": 0.3508, + "step": 7707, + "task_loss": 0.7027719616889954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5089862942695618, + "epoch": 6.52, + "learning_rate": 3.459727086100713e-06, + "loss": 0.4111, + "step": 7708, + "task_loss": 1.740857481956482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2740253508090973, + "epoch": 6.52, + "learning_rate": 3.4536891679748822e-06, + "loss": 0.4096, + "step": 7709, + "task_loss": 0.4862251877784729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8225532174110413, + "epoch": 6.52, + "learning_rate": 3.4476512498490524e-06, + "loss": 0.5542, + "step": 7710, + "task_loss": 0.852367103099823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4984482526779175, + "epoch": 6.52, + "learning_rate": 3.441613331723222e-06, + "loss": 0.5627, + "step": 7711, + "task_loss": 0.2050698697566986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25185441970825195, + "epoch": 6.52, + "learning_rate": 3.435575413597392e-06, + "loss": 0.3759, + "step": 7712, + "task_loss": 0.8190176486968994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2851306200027466, + "epoch": 6.52, + "learning_rate": 3.4295374954715616e-06, + "loss": 0.4244, + "step": 7713, + "task_loss": 0.3571352958679199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3931918740272522, + "epoch": 6.52, + "learning_rate": 3.4234995773457313e-06, + "loss": 0.4807, + "step": 7714, + "task_loss": 0.5169183015823364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3308052122592926, + "epoch": 6.52, + "learning_rate": 3.417461659219901e-06, + "loss": 0.3833, + "step": 7715, + "task_loss": 0.9095100164413452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2579752802848816, + "epoch": 6.52, + "learning_rate": 3.411423741094071e-06, + "loss": 0.4206, + "step": 7716, + "task_loss": 0.39869749546051025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24683046340942383, + "epoch": 6.52, + "learning_rate": 3.4053858229682405e-06, + "loss": 0.3573, + "step": 7717, + "task_loss": 0.29481446743011475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5046948194503784, + "epoch": 6.52, + "learning_rate": 3.3993479048424106e-06, + "loss": 0.5402, + "step": 7718, + "task_loss": 0.5728384852409363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2638630270957947, + "epoch": 6.52, + "learning_rate": 3.39330998671658e-06, + "loss": 0.4007, + "step": 7719, + "task_loss": 0.6914132833480835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3226665258407593, + "epoch": 6.53, + "learning_rate": 3.38727206859075e-06, + "loss": 0.431, + "step": 7720, + "task_loss": 0.8165730834007263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6342872381210327, + "epoch": 6.53, + "learning_rate": 3.38123415046492e-06, + "loss": 0.4441, + "step": 7721, + "task_loss": 0.3512526750564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5903358459472656, + "epoch": 6.53, + "learning_rate": 3.3751962323390895e-06, + "loss": 0.5681, + "step": 7722, + "task_loss": 0.7592611908912659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2888670861721039, + "epoch": 6.53, + "learning_rate": 3.3691583142132597e-06, + "loss": 0.4407, + "step": 7723, + "task_loss": 0.12918084859848022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2667515277862549, + "epoch": 6.53, + "learning_rate": 3.363120396087429e-06, + "loss": 0.4991, + "step": 7724, + "task_loss": 0.2222709357738495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31435298919677734, + "epoch": 6.53, + "learning_rate": 3.357082477961599e-06, + "loss": 0.4506, + "step": 7725, + "task_loss": 0.9514991641044617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25086653232574463, + "epoch": 6.53, + "learning_rate": 3.3510445598357693e-06, + "loss": 0.4223, + "step": 7726, + "task_loss": 0.5273663997650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3931138515472412, + "epoch": 6.53, + "learning_rate": 3.3450066417099386e-06, + "loss": 0.3153, + "step": 7727, + "task_loss": 0.20566728711128235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31249791383743286, + "epoch": 6.53, + "learning_rate": 3.3389687235841087e-06, + "loss": 0.3433, + "step": 7728, + "task_loss": 0.02320913039147854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4897136688232422, + "epoch": 6.53, + "learning_rate": 3.332930805458278e-06, + "loss": 0.3873, + "step": 7729, + "task_loss": 0.4566836953163147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2944187819957733, + "epoch": 6.53, + "learning_rate": 3.326892887332448e-06, + "loss": 0.3735, + "step": 7730, + "task_loss": 0.5994160175323486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29624760150909424, + "epoch": 6.53, + "learning_rate": 3.3208549692066174e-06, + "loss": 0.3797, + "step": 7731, + "task_loss": 0.6483547687530518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28410589694976807, + "epoch": 6.54, + "learning_rate": 3.3148170510807876e-06, + "loss": 0.3718, + "step": 7732, + "task_loss": 0.9737540483474731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4408337473869324, + "epoch": 6.54, + "learning_rate": 3.3087791329549573e-06, + "loss": 0.3817, + "step": 7733, + "task_loss": 0.7905225157737732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5130460858345032, + "epoch": 6.54, + "learning_rate": 3.302741214829127e-06, + "loss": 0.4205, + "step": 7734, + "task_loss": 0.6533775329589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27270805835723877, + "epoch": 6.54, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.4238, + "step": 7735, + "task_loss": 0.3389483690261841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5491988062858582, + "epoch": 6.54, + "learning_rate": 3.2906653785774665e-06, + "loss": 0.3859, + "step": 7736, + "task_loss": 1.2513906955718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8034136295318604, + "epoch": 6.54, + "learning_rate": 3.2846274604516362e-06, + "loss": 0.5724, + "step": 7737, + "task_loss": 1.2257740497589111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3546361029148102, + "epoch": 6.54, + "learning_rate": 3.2785895423258064e-06, + "loss": 0.3237, + "step": 7738, + "task_loss": 0.7468657493591309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3124690353870392, + "epoch": 6.54, + "learning_rate": 3.2725516241999757e-06, + "loss": 0.3118, + "step": 7739, + "task_loss": 0.31462687253952026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2261171042919159, + "epoch": 6.54, + "learning_rate": 3.266513706074146e-06, + "loss": 0.3055, + "step": 7740, + "task_loss": 0.470099538564682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2768022418022156, + "epoch": 6.54, + "learning_rate": 3.260475787948315e-06, + "loss": 0.3323, + "step": 7741, + "task_loss": 0.6580920219421387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23272868990898132, + "epoch": 6.54, + "learning_rate": 3.2544378698224853e-06, + "loss": 0.298, + "step": 7742, + "task_loss": 0.17747752368450165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3885907530784607, + "epoch": 6.54, + "learning_rate": 3.2483999516966554e-06, + "loss": 0.3377, + "step": 7743, + "task_loss": 0.3679782450199127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5739710330963135, + "epoch": 6.55, + "learning_rate": 3.2423620335708247e-06, + "loss": 0.5162, + "step": 7744, + "task_loss": 0.607576310634613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2692696154117584, + "epoch": 6.55, + "learning_rate": 3.236324115444995e-06, + "loss": 0.3574, + "step": 7745, + "task_loss": 0.5703031420707703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2844216227531433, + "epoch": 6.55, + "learning_rate": 3.230286197319164e-06, + "loss": 0.4341, + "step": 7746, + "task_loss": 0.146365687251091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.678672194480896, + "epoch": 6.55, + "learning_rate": 3.2242482791933343e-06, + "loss": 0.5235, + "step": 7747, + "task_loss": 1.024153709411621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29055529832839966, + "epoch": 6.55, + "learning_rate": 3.2182103610675045e-06, + "loss": 0.3627, + "step": 7748, + "task_loss": 0.08211304247379303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4737032949924469, + "epoch": 6.55, + "learning_rate": 3.2121724429416738e-06, + "loss": 0.584, + "step": 7749, + "task_loss": 0.9777902960777283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27337443828582764, + "epoch": 6.55, + "learning_rate": 3.206134524815844e-06, + "loss": 0.3719, + "step": 7750, + "task_loss": 0.5225952863693237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24841195344924927, + "epoch": 6.55, + "learning_rate": 3.200096606690013e-06, + "loss": 0.4198, + "step": 7751, + "task_loss": 0.3015982210636139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3621988892555237, + "epoch": 6.55, + "learning_rate": 3.1940586885641833e-06, + "loss": 0.5122, + "step": 7752, + "task_loss": 1.0003283023834229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4399791359901428, + "epoch": 6.55, + "learning_rate": 3.1880207704383535e-06, + "loss": 0.4226, + "step": 7753, + "task_loss": 0.5162055492401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5156853795051575, + "epoch": 6.55, + "learning_rate": 3.181982852312523e-06, + "loss": 0.518, + "step": 7754, + "task_loss": 1.6041244268417358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7884372472763062, + "epoch": 6.56, + "learning_rate": 3.1759449341866925e-06, + "loss": 0.4916, + "step": 7755, + "task_loss": 1.0826177597045898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4020611047744751, + "epoch": 6.56, + "learning_rate": 3.1699070160608622e-06, + "loss": 0.356, + "step": 7756, + "task_loss": 0.4561913311481476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3271491229534149, + "epoch": 6.56, + "learning_rate": 3.163869097935032e-06, + "loss": 0.4042, + "step": 7757, + "task_loss": 0.5117846727371216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47055765986442566, + "epoch": 6.56, + "learning_rate": 3.157831179809202e-06, + "loss": 0.3912, + "step": 7758, + "task_loss": 0.9024670124053955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21036764979362488, + "epoch": 6.56, + "learning_rate": 3.1517932616833714e-06, + "loss": 0.3513, + "step": 7759, + "task_loss": 0.45168206095695496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1887086033821106, + "epoch": 6.56, + "learning_rate": 3.1457553435575416e-06, + "loss": 0.4591, + "step": 7760, + "task_loss": 0.3421610891819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39320677518844604, + "epoch": 6.56, + "learning_rate": 3.139717425431711e-06, + "loss": 0.3975, + "step": 7761, + "task_loss": 0.8622456789016724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3508971929550171, + "epoch": 6.56, + "learning_rate": 3.133679507305881e-06, + "loss": 0.3903, + "step": 7762, + "task_loss": 0.5515611171722412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4156569540500641, + "epoch": 6.56, + "learning_rate": 3.127641589180051e-06, + "loss": 0.4597, + "step": 7763, + "task_loss": 0.5957874655723572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.528780460357666, + "epoch": 6.56, + "learning_rate": 3.1216036710542205e-06, + "loss": 0.4121, + "step": 7764, + "task_loss": 0.8322903513908386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27483171224594116, + "epoch": 6.56, + "learning_rate": 3.11556575292839e-06, + "loss": 0.4193, + "step": 7765, + "task_loss": 0.7760427594184875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43639183044433594, + "epoch": 6.56, + "learning_rate": 3.1095278348025603e-06, + "loss": 0.4458, + "step": 7766, + "task_loss": 0.25227904319763184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27535173296928406, + "epoch": 6.57, + "learning_rate": 3.10348991667673e-06, + "loss": 0.3113, + "step": 7767, + "task_loss": 0.27668142318725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1979883462190628, + "epoch": 6.57, + "learning_rate": 3.0974519985508998e-06, + "loss": 0.4337, + "step": 7768, + "task_loss": 0.5311219692230225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.11772812157869339, + "epoch": 6.57, + "learning_rate": 3.0914140804250695e-06, + "loss": 0.2962, + "step": 7769, + "task_loss": 0.006232676561921835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42099103331565857, + "epoch": 6.57, + "learning_rate": 3.0853761622992392e-06, + "loss": 0.4041, + "step": 7770, + "task_loss": 0.31587886810302734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4458006024360657, + "epoch": 6.57, + "learning_rate": 3.0793382441734094e-06, + "loss": 0.408, + "step": 7771, + "task_loss": 0.9738448262214661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2247922420501709, + "epoch": 6.57, + "learning_rate": 3.073300326047579e-06, + "loss": 0.336, + "step": 7772, + "task_loss": 0.34269025921821594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7674140334129333, + "epoch": 6.57, + "learning_rate": 3.067262407921749e-06, + "loss": 0.6021, + "step": 7773, + "task_loss": 1.3403419256210327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4358260929584503, + "epoch": 6.57, + "learning_rate": 3.0612244897959185e-06, + "loss": 0.5695, + "step": 7774, + "task_loss": 0.6478825807571411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.515540599822998, + "epoch": 6.57, + "learning_rate": 3.0551865716700883e-06, + "loss": 0.3508, + "step": 7775, + "task_loss": 0.5011901259422302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4169060289859772, + "epoch": 6.57, + "learning_rate": 3.0491486535442584e-06, + "loss": 0.4068, + "step": 7776, + "task_loss": 1.208677053451538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42060232162475586, + "epoch": 6.57, + "learning_rate": 3.043110735418428e-06, + "loss": 0.4037, + "step": 7777, + "task_loss": 0.623650848865509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41231703758239746, + "epoch": 6.57, + "learning_rate": 3.037072817292598e-06, + "loss": 0.3776, + "step": 7778, + "task_loss": 0.9123532772064209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42431432008743286, + "epoch": 6.58, + "learning_rate": 3.0310348991667676e-06, + "loss": 0.4698, + "step": 7779, + "task_loss": 0.7035282850265503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5214707851409912, + "epoch": 6.58, + "learning_rate": 3.0249969810409373e-06, + "loss": 0.4172, + "step": 7780, + "task_loss": 0.715429961681366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3027152717113495, + "epoch": 6.58, + "learning_rate": 3.018959062915107e-06, + "loss": 0.4268, + "step": 7781, + "task_loss": 0.8404766321182251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38902848958969116, + "epoch": 6.58, + "learning_rate": 3.0129211447892768e-06, + "loss": 0.3811, + "step": 7782, + "task_loss": 0.07236456125974655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5380887389183044, + "epoch": 6.58, + "learning_rate": 3.0068832266634465e-06, + "loss": 0.418, + "step": 7783, + "task_loss": 1.2045073509216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40223369002342224, + "epoch": 6.58, + "learning_rate": 3.0008453085376162e-06, + "loss": 0.524, + "step": 7784, + "task_loss": 0.7834155559539795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4357958137989044, + "epoch": 6.58, + "learning_rate": 2.994807390411786e-06, + "loss": 0.3435, + "step": 7785, + "task_loss": 0.3571382164955139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3943432569503784, + "epoch": 6.58, + "learning_rate": 2.9887694722859557e-06, + "loss": 0.4797, + "step": 7786, + "task_loss": 1.0196900367736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6913484930992126, + "epoch": 6.58, + "learning_rate": 2.982731554160126e-06, + "loss": 0.4286, + "step": 7787, + "task_loss": 1.1754988431930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3823729455471039, + "epoch": 6.58, + "learning_rate": 2.9766936360342955e-06, + "loss": 0.3534, + "step": 7788, + "task_loss": 0.5555999875068665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4940057694911957, + "epoch": 6.58, + "learning_rate": 2.9706557179084653e-06, + "loss": 0.3469, + "step": 7789, + "task_loss": 0.5420287251472473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24496492743492126, + "epoch": 6.58, + "learning_rate": 2.964617799782635e-06, + "loss": 0.3917, + "step": 7790, + "task_loss": 0.150486022233963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.496233195066452, + "epoch": 6.59, + "learning_rate": 2.9585798816568047e-06, + "loss": 0.3905, + "step": 7791, + "task_loss": 0.7251565456390381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5589656233787537, + "epoch": 6.59, + "learning_rate": 2.952541963530975e-06, + "loss": 0.4376, + "step": 7792, + "task_loss": 1.370635986328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.433832585811615, + "epoch": 6.59, + "learning_rate": 2.9465040454051446e-06, + "loss": 0.4523, + "step": 7793, + "task_loss": 0.8142435550689697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4779830574989319, + "epoch": 6.59, + "learning_rate": 2.9404661272793143e-06, + "loss": 0.5005, + "step": 7794, + "task_loss": 0.6257989406585693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34620389342308044, + "epoch": 6.59, + "learning_rate": 2.934428209153484e-06, + "loss": 0.4298, + "step": 7795, + "task_loss": 0.3199808597564697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27027735114097595, + "epoch": 6.59, + "learning_rate": 2.9283902910276537e-06, + "loss": 0.3958, + "step": 7796, + "task_loss": 0.6207057237625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4141535758972168, + "epoch": 6.59, + "learning_rate": 2.922352372901824e-06, + "loss": 0.3452, + "step": 7797, + "task_loss": 0.3256186246871948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33471545577049255, + "epoch": 6.59, + "learning_rate": 2.9163144547759936e-06, + "loss": 0.379, + "step": 7798, + "task_loss": 0.5102890133857727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33844488859176636, + "epoch": 6.59, + "learning_rate": 2.9102765366501633e-06, + "loss": 0.4639, + "step": 7799, + "task_loss": 0.4191419184207916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15159349143505096, + "epoch": 6.59, + "learning_rate": 2.904238618524333e-06, + "loss": 0.4018, + "step": 7800, + "task_loss": 0.04137583449482918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.466411828994751, + "epoch": 6.59, + "learning_rate": 2.898200700398503e-06, + "loss": 0.5089, + "step": 7801, + "task_loss": 0.43680641055107117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37998342514038086, + "epoch": 6.59, + "learning_rate": 2.8921627822726725e-06, + "loss": 0.4395, + "step": 7802, + "task_loss": 0.6249713897705078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5105043649673462, + "epoch": 6.6, + "learning_rate": 2.8861248641468422e-06, + "loss": 0.4452, + "step": 7803, + "task_loss": 0.8355153799057007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3620811104774475, + "epoch": 6.6, + "learning_rate": 2.880086946021012e-06, + "loss": 0.4232, + "step": 7804, + "task_loss": 0.5232028365135193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4444090723991394, + "epoch": 6.6, + "learning_rate": 2.8740490278951817e-06, + "loss": 0.4479, + "step": 7805, + "task_loss": 0.5205304026603699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5998578667640686, + "epoch": 6.6, + "learning_rate": 2.8680111097693514e-06, + "loss": 0.5906, + "step": 7806, + "task_loss": 1.0072587728500366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6915017366409302, + "epoch": 6.6, + "learning_rate": 2.8619731916435216e-06, + "loss": 0.4136, + "step": 7807, + "task_loss": 1.1984752416610718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4589049220085144, + "epoch": 6.6, + "learning_rate": 2.8559352735176913e-06, + "loss": 0.501, + "step": 7808, + "task_loss": 0.5365243554115295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3888126313686371, + "epoch": 6.6, + "learning_rate": 2.849897355391861e-06, + "loss": 0.3624, + "step": 7809, + "task_loss": 1.0376373529434204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3642645478248596, + "epoch": 6.6, + "learning_rate": 2.8438594372660307e-06, + "loss": 0.4473, + "step": 7810, + "task_loss": 0.17252805829048157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3289703130722046, + "epoch": 6.6, + "learning_rate": 2.8378215191402005e-06, + "loss": 0.3849, + "step": 7811, + "task_loss": 0.9243664741516113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2548018991947174, + "epoch": 6.6, + "learning_rate": 2.83178360101437e-06, + "loss": 0.404, + "step": 7812, + "task_loss": 0.058791011571884155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3100758194923401, + "epoch": 6.6, + "learning_rate": 2.8257456828885403e-06, + "loss": 0.3332, + "step": 7813, + "task_loss": 0.20684626698493958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37839335203170776, + "epoch": 6.6, + "learning_rate": 2.81970776476271e-06, + "loss": 0.3993, + "step": 7814, + "task_loss": 0.29184335470199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4549984037876129, + "epoch": 6.61, + "learning_rate": 2.8136698466368798e-06, + "loss": 0.4734, + "step": 7815, + "task_loss": 0.6964058876037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20752930641174316, + "epoch": 6.61, + "learning_rate": 2.8076319285110495e-06, + "loss": 0.2656, + "step": 7816, + "task_loss": 0.43779420852661133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38366836309432983, + "epoch": 6.61, + "learning_rate": 2.8015940103852192e-06, + "loss": 0.4193, + "step": 7817, + "task_loss": 0.9183304309844971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5784552693367004, + "epoch": 6.61, + "learning_rate": 2.7955560922593894e-06, + "loss": 0.4785, + "step": 7818, + "task_loss": 0.6070873737335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5086684226989746, + "epoch": 6.61, + "learning_rate": 2.789518174133559e-06, + "loss": 0.402, + "step": 7819, + "task_loss": 0.9996541738510132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48842471837997437, + "epoch": 6.61, + "learning_rate": 2.783480256007729e-06, + "loss": 0.407, + "step": 7820, + "task_loss": 1.2925785779953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4267001748085022, + "epoch": 6.61, + "learning_rate": 2.7774423378818985e-06, + "loss": 0.3581, + "step": 7821, + "task_loss": 1.0676548480987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4967702329158783, + "epoch": 6.61, + "learning_rate": 2.7714044197560683e-06, + "loss": 0.468, + "step": 7822, + "task_loss": 0.5883840322494507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4273930788040161, + "epoch": 6.61, + "learning_rate": 2.765366501630238e-06, + "loss": 0.5187, + "step": 7823, + "task_loss": 0.7461974620819092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3583569824695587, + "epoch": 6.61, + "learning_rate": 2.7593285835044077e-06, + "loss": 0.4721, + "step": 7824, + "task_loss": 0.7887017726898193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2187284231185913, + "epoch": 6.61, + "learning_rate": 2.7532906653785774e-06, + "loss": 0.2664, + "step": 7825, + "task_loss": 0.10962007939815521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.310714453458786, + "epoch": 6.61, + "learning_rate": 2.747252747252747e-06, + "loss": 0.3914, + "step": 7826, + "task_loss": 0.8559110164642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4884030520915985, + "epoch": 6.62, + "learning_rate": 2.741214829126917e-06, + "loss": 0.5416, + "step": 7827, + "task_loss": 1.5918447971343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.431973934173584, + "epoch": 6.62, + "learning_rate": 2.735176911001087e-06, + "loss": 0.5168, + "step": 7828, + "task_loss": 1.1161606311798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2481343150138855, + "epoch": 6.62, + "learning_rate": 2.7291389928752568e-06, + "loss": 0.3986, + "step": 7829, + "task_loss": 0.8261445760726929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5061767101287842, + "epoch": 6.62, + "learning_rate": 2.7231010747494265e-06, + "loss": 0.6034, + "step": 7830, + "task_loss": 0.3796250522136688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4761250615119934, + "epoch": 6.62, + "learning_rate": 2.717063156623596e-06, + "loss": 0.4086, + "step": 7831, + "task_loss": 0.6396211385726929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4305231273174286, + "epoch": 6.62, + "learning_rate": 2.711025238497766e-06, + "loss": 0.3261, + "step": 7832, + "task_loss": 0.29886797070503235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.206290140748024, + "epoch": 6.62, + "learning_rate": 2.7049873203719357e-06, + "loss": 0.3853, + "step": 7833, + "task_loss": 0.7150135636329651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4795604348182678, + "epoch": 6.62, + "learning_rate": 2.698949402246106e-06, + "loss": 0.4541, + "step": 7834, + "task_loss": 0.9646514058113098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3181604743003845, + "epoch": 6.62, + "learning_rate": 2.6929114841202755e-06, + "loss": 0.3892, + "step": 7835, + "task_loss": 0.6452935934066772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.296188086271286, + "epoch": 6.62, + "learning_rate": 2.6868735659944453e-06, + "loss": 0.5251, + "step": 7836, + "task_loss": 1.1955130100250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23387813568115234, + "epoch": 6.62, + "learning_rate": 2.680835647868615e-06, + "loss": 0.3997, + "step": 7837, + "task_loss": 0.237611785531044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4236043691635132, + "epoch": 6.63, + "learning_rate": 2.6747977297427847e-06, + "loss": 0.3912, + "step": 7838, + "task_loss": 1.132907509803772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35488858819007874, + "epoch": 6.63, + "learning_rate": 2.668759811616955e-06, + "loss": 0.4736, + "step": 7839, + "task_loss": 1.3407732248306274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3824750781059265, + "epoch": 6.63, + "learning_rate": 2.6627218934911246e-06, + "loss": 0.4276, + "step": 7840, + "task_loss": 1.432985782623291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31375280022621155, + "epoch": 6.63, + "learning_rate": 2.6566839753652943e-06, + "loss": 0.4523, + "step": 7841, + "task_loss": 0.029650317505002022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37818723917007446, + "epoch": 6.63, + "learning_rate": 2.650646057239464e-06, + "loss": 0.5582, + "step": 7842, + "task_loss": 0.3610091805458069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39065325260162354, + "epoch": 6.63, + "learning_rate": 2.6446081391136337e-06, + "loss": 0.4625, + "step": 7843, + "task_loss": 0.6606209874153137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3053005039691925, + "epoch": 6.63, + "learning_rate": 2.638570220987804e-06, + "loss": 0.2674, + "step": 7844, + "task_loss": 0.44320106506347656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3128683865070343, + "epoch": 6.63, + "learning_rate": 2.632532302861973e-06, + "loss": 0.4007, + "step": 7845, + "task_loss": 0.9063977003097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5638011693954468, + "epoch": 6.63, + "learning_rate": 2.626494384736143e-06, + "loss": 0.5071, + "step": 7846, + "task_loss": 1.4544475078582764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40162816643714905, + "epoch": 6.63, + "learning_rate": 2.6204564666103126e-06, + "loss": 0.4243, + "step": 7847, + "task_loss": 0.8912254571914673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22892434895038605, + "epoch": 6.63, + "learning_rate": 2.6144185484844824e-06, + "loss": 0.277, + "step": 7848, + "task_loss": 0.45351681113243103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1996748000383377, + "epoch": 6.63, + "learning_rate": 2.6083806303586525e-06, + "loss": 0.4053, + "step": 7849, + "task_loss": 0.5147760510444641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46449971199035645, + "epoch": 6.64, + "learning_rate": 2.6023427122328222e-06, + "loss": 0.3243, + "step": 7850, + "task_loss": 1.0577633380889893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8068167567253113, + "epoch": 6.64, + "learning_rate": 2.596304794106992e-06, + "loss": 0.4757, + "step": 7851, + "task_loss": 0.6515446305274963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7462450265884399, + "epoch": 6.64, + "learning_rate": 2.5902668759811617e-06, + "loss": 0.4472, + "step": 7852, + "task_loss": 0.7469035983085632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41302064061164856, + "epoch": 6.64, + "learning_rate": 2.5842289578553314e-06, + "loss": 0.4616, + "step": 7853, + "task_loss": 0.4949371814727783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20025980472564697, + "epoch": 6.64, + "learning_rate": 2.578191039729501e-06, + "loss": 0.4748, + "step": 7854, + "task_loss": 0.035653483122587204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5265182256698608, + "epoch": 6.64, + "learning_rate": 2.5721531216036713e-06, + "loss": 0.3891, + "step": 7855, + "task_loss": 0.9856874942779541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36393970251083374, + "epoch": 6.64, + "learning_rate": 2.566115203477841e-06, + "loss": 0.5077, + "step": 7856, + "task_loss": 0.6945651173591614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2998431324958801, + "epoch": 6.64, + "learning_rate": 2.5600772853520107e-06, + "loss": 0.3087, + "step": 7857, + "task_loss": 0.4087787866592407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3158137798309326, + "epoch": 6.64, + "learning_rate": 2.5540393672261805e-06, + "loss": 0.3701, + "step": 7858, + "task_loss": 0.21478907763957977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3998490571975708, + "epoch": 6.64, + "learning_rate": 2.54800144910035e-06, + "loss": 0.3526, + "step": 7859, + "task_loss": 0.6403995156288147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4730547368526459, + "epoch": 6.64, + "learning_rate": 2.5419635309745203e-06, + "loss": 0.3591, + "step": 7860, + "task_loss": 0.8645583391189575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23492273688316345, + "epoch": 6.64, + "learning_rate": 2.53592561284869e-06, + "loss": 0.3927, + "step": 7861, + "task_loss": 0.595885694026947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32106345891952515, + "epoch": 6.65, + "learning_rate": 2.5298876947228598e-06, + "loss": 0.3829, + "step": 7862, + "task_loss": 0.16185112297534943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2626842260360718, + "epoch": 6.65, + "learning_rate": 2.5238497765970295e-06, + "loss": 0.4081, + "step": 7863, + "task_loss": 0.8436007499694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31590473651885986, + "epoch": 6.65, + "learning_rate": 2.5178118584711992e-06, + "loss": 0.4463, + "step": 7864, + "task_loss": 0.24911530315876007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6417696475982666, + "epoch": 6.65, + "learning_rate": 2.5117739403453694e-06, + "loss": 0.5727, + "step": 7865, + "task_loss": 1.059611201286316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4639134407043457, + "epoch": 6.65, + "learning_rate": 2.505736022219539e-06, + "loss": 0.4842, + "step": 7866, + "task_loss": 0.3415990471839905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3123406767845154, + "epoch": 6.65, + "learning_rate": 2.499698104093709e-06, + "loss": 0.3374, + "step": 7867, + "task_loss": 0.40217944979667664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26490211486816406, + "epoch": 6.65, + "learning_rate": 2.4936601859678785e-06, + "loss": 0.3807, + "step": 7868, + "task_loss": 0.26621001958847046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2414810061454773, + "epoch": 6.65, + "learning_rate": 2.4876222678420483e-06, + "loss": 0.409, + "step": 7869, + "task_loss": 0.32589191198349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23923486471176147, + "epoch": 6.65, + "learning_rate": 2.481584349716218e-06, + "loss": 0.3267, + "step": 7870, + "task_loss": 0.6938450932502747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8443307280540466, + "epoch": 6.65, + "learning_rate": 2.4755464315903877e-06, + "loss": 0.4717, + "step": 7871, + "task_loss": 1.4285906553268433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1790551245212555, + "epoch": 6.65, + "learning_rate": 2.4695085134645574e-06, + "loss": 0.4403, + "step": 7872, + "task_loss": 0.43923547863960266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5515785813331604, + "epoch": 6.65, + "learning_rate": 2.463470595338727e-06, + "loss": 0.4053, + "step": 7873, + "task_loss": 0.7410345673561096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2507975399494171, + "epoch": 6.66, + "learning_rate": 2.457432677212897e-06, + "loss": 0.3361, + "step": 7874, + "task_loss": 0.20648400485515594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42468440532684326, + "epoch": 6.66, + "learning_rate": 2.4513947590870666e-06, + "loss": 0.5302, + "step": 7875, + "task_loss": 1.0519031286239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4347517192363739, + "epoch": 6.66, + "learning_rate": 2.4453568409612368e-06, + "loss": 0.4273, + "step": 7876, + "task_loss": 0.12205415219068527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26051557064056396, + "epoch": 6.66, + "learning_rate": 2.4393189228354065e-06, + "loss": 0.3449, + "step": 7877, + "task_loss": 0.865318775177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.411740779876709, + "epoch": 6.66, + "learning_rate": 2.433281004709576e-06, + "loss": 0.3478, + "step": 7878, + "task_loss": 0.42227035760879517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3249126970767975, + "epoch": 6.66, + "learning_rate": 2.427243086583746e-06, + "loss": 0.4593, + "step": 7879, + "task_loss": 0.3463933765888214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20440863072872162, + "epoch": 6.66, + "learning_rate": 2.4212051684579157e-06, + "loss": 0.3155, + "step": 7880, + "task_loss": 0.3279677629470825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3153698146343231, + "epoch": 6.66, + "learning_rate": 2.415167250332086e-06, + "loss": 0.4577, + "step": 7881, + "task_loss": 1.3202075958251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3663857579231262, + "epoch": 6.66, + "learning_rate": 2.4091293322062555e-06, + "loss": 0.4314, + "step": 7882, + "task_loss": 0.6951327323913574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2621501386165619, + "epoch": 6.66, + "learning_rate": 2.4030914140804253e-06, + "loss": 0.353, + "step": 7883, + "task_loss": 0.3960483968257904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49510759115219116, + "epoch": 6.66, + "learning_rate": 2.397053495954595e-06, + "loss": 0.3833, + "step": 7884, + "task_loss": 0.6981363892555237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3025045096874237, + "epoch": 6.66, + "learning_rate": 2.3910155778287647e-06, + "loss": 0.4039, + "step": 7885, + "task_loss": 0.36626124382019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5742518901824951, + "epoch": 6.67, + "learning_rate": 2.384977659702935e-06, + "loss": 0.4343, + "step": 7886, + "task_loss": 0.556591808795929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5511546730995178, + "epoch": 6.67, + "learning_rate": 2.3789397415771046e-06, + "loss": 0.4491, + "step": 7887, + "task_loss": 1.404227614402771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.233842134475708, + "epoch": 6.67, + "learning_rate": 2.3729018234512743e-06, + "loss": 0.3665, + "step": 7888, + "task_loss": 0.615575909614563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5038156509399414, + "epoch": 6.67, + "learning_rate": 2.366863905325444e-06, + "loss": 0.4828, + "step": 7889, + "task_loss": 1.5380604267120361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41283005475997925, + "epoch": 6.67, + "learning_rate": 2.3608259871996137e-06, + "loss": 0.3422, + "step": 7890, + "task_loss": 0.675048291683197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16099828481674194, + "epoch": 6.67, + "learning_rate": 2.3547880690737835e-06, + "loss": 0.32, + "step": 7891, + "task_loss": 0.20258113741874695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5823311805725098, + "epoch": 6.67, + "learning_rate": 2.348750150947953e-06, + "loss": 0.37, + "step": 7892, + "task_loss": 0.6660346388816833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.188491553068161, + "epoch": 6.67, + "learning_rate": 2.342712232822123e-06, + "loss": 0.3943, + "step": 7893, + "task_loss": 0.4211389124393463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47819140553474426, + "epoch": 6.67, + "learning_rate": 2.3366743146962926e-06, + "loss": 0.5128, + "step": 7894, + "task_loss": 1.080166220664978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2788325846195221, + "epoch": 6.67, + "learning_rate": 2.3306363965704624e-06, + "loss": 0.3076, + "step": 7895, + "task_loss": 0.12697818875312805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16821561753749847, + "epoch": 6.67, + "learning_rate": 2.324598478444632e-06, + "loss": 0.3315, + "step": 7896, + "task_loss": 0.5368017554283142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4878840446472168, + "epoch": 6.67, + "learning_rate": 2.3185605603188022e-06, + "loss": 0.4428, + "step": 7897, + "task_loss": 1.1241984367370605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22706350684165955, + "epoch": 6.68, + "learning_rate": 2.312522642192972e-06, + "loss": 0.4323, + "step": 7898, + "task_loss": 0.4268385171890259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25211775302886963, + "epoch": 6.68, + "learning_rate": 2.3064847240671417e-06, + "loss": 0.3841, + "step": 7899, + "task_loss": 0.47959211468696594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33337458968162537, + "epoch": 6.68, + "learning_rate": 2.3004468059413114e-06, + "loss": 0.5069, + "step": 7900, + "task_loss": 0.19119007885456085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30133315920829773, + "epoch": 6.68, + "learning_rate": 2.294408887815481e-06, + "loss": 0.4271, + "step": 7901, + "task_loss": 0.8916338682174683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37784767150878906, + "epoch": 6.68, + "learning_rate": 2.2883709696896513e-06, + "loss": 0.4181, + "step": 7902, + "task_loss": 0.9826608896255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26956483721733093, + "epoch": 6.68, + "learning_rate": 2.282333051563821e-06, + "loss": 0.3216, + "step": 7903, + "task_loss": 0.62283855676651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24518194794654846, + "epoch": 6.68, + "learning_rate": 2.2762951334379907e-06, + "loss": 0.4202, + "step": 7904, + "task_loss": 1.2418737411499023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23428012430667877, + "epoch": 6.68, + "learning_rate": 2.2702572153121605e-06, + "loss": 0.4165, + "step": 7905, + "task_loss": 0.29500505328178406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5334722995758057, + "epoch": 6.68, + "learning_rate": 2.26421929718633e-06, + "loss": 0.4945, + "step": 7906, + "task_loss": 0.3435673117637634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.326000452041626, + "epoch": 6.68, + "learning_rate": 2.2581813790605003e-06, + "loss": 0.4026, + "step": 7907, + "task_loss": 0.2820214033126831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46510452032089233, + "epoch": 6.68, + "learning_rate": 2.25214346093467e-06, + "loss": 0.4793, + "step": 7908, + "task_loss": 0.37099313735961914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16066010296344757, + "epoch": 6.69, + "learning_rate": 2.2461055428088398e-06, + "loss": 0.359, + "step": 7909, + "task_loss": 0.028501689434051514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6241548657417297, + "epoch": 6.69, + "learning_rate": 2.2400676246830095e-06, + "loss": 0.3806, + "step": 7910, + "task_loss": 1.0853996276855469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25910311937332153, + "epoch": 6.69, + "learning_rate": 2.2340297065571792e-06, + "loss": 0.3262, + "step": 7911, + "task_loss": 0.05178667977452278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32086479663848877, + "epoch": 6.69, + "learning_rate": 2.227991788431349e-06, + "loss": 0.3427, + "step": 7912, + "task_loss": 0.4692910611629486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2468743473291397, + "epoch": 6.69, + "learning_rate": 2.2219538703055187e-06, + "loss": 0.3925, + "step": 7913, + "task_loss": 0.7566897869110107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5782186985015869, + "epoch": 6.69, + "learning_rate": 2.2159159521796884e-06, + "loss": 0.447, + "step": 7914, + "task_loss": 0.6278772950172424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8625306487083435, + "epoch": 6.69, + "learning_rate": 2.209878034053858e-06, + "loss": 0.524, + "step": 7915, + "task_loss": 1.9267921447753906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26600950956344604, + "epoch": 6.69, + "learning_rate": 2.203840115928028e-06, + "loss": 0.3964, + "step": 7916, + "task_loss": 0.30941513180732727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40066784620285034, + "epoch": 6.69, + "learning_rate": 2.197802197802198e-06, + "loss": 0.4589, + "step": 7917, + "task_loss": 0.3184880018234253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28865593671798706, + "epoch": 6.69, + "learning_rate": 2.1917642796763677e-06, + "loss": 0.4239, + "step": 7918, + "task_loss": 0.045572374016046524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.557461678981781, + "epoch": 6.69, + "learning_rate": 2.1857263615505374e-06, + "loss": 0.3945, + "step": 7919, + "task_loss": 0.2711033225059509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.289492666721344, + "epoch": 6.69, + "learning_rate": 2.179688443424707e-06, + "loss": 0.3761, + "step": 7920, + "task_loss": 0.39926058053970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5150561928749084, + "epoch": 6.7, + "learning_rate": 2.173650525298877e-06, + "loss": 0.4371, + "step": 7921, + "task_loss": 0.9533637762069702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.515261173248291, + "epoch": 6.7, + "learning_rate": 2.1676126071730466e-06, + "loss": 0.5923, + "step": 7922, + "task_loss": 0.43281084299087524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7919328808784485, + "epoch": 6.7, + "learning_rate": 2.1615746890472168e-06, + "loss": 0.5064, + "step": 7923, + "task_loss": 0.4821922183036804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30817073583602905, + "epoch": 6.7, + "learning_rate": 2.1555367709213865e-06, + "loss": 0.3927, + "step": 7924, + "task_loss": 0.4096126854419708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30237463116645813, + "epoch": 6.7, + "learning_rate": 2.149498852795556e-06, + "loss": 0.442, + "step": 7925, + "task_loss": 0.5884860157966614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3515297770500183, + "epoch": 6.7, + "learning_rate": 2.143460934669726e-06, + "loss": 0.3581, + "step": 7926, + "task_loss": 0.3662824034690857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4089769423007965, + "epoch": 6.7, + "learning_rate": 2.1374230165438957e-06, + "loss": 0.3159, + "step": 7927, + "task_loss": 0.5543223023414612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5224348306655884, + "epoch": 6.7, + "learning_rate": 2.131385098418066e-06, + "loss": 0.4309, + "step": 7928, + "task_loss": 0.9393535256385803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33549395203590393, + "epoch": 6.7, + "learning_rate": 2.1253471802922355e-06, + "loss": 0.3378, + "step": 7929, + "task_loss": 0.2982596755027771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5055288076400757, + "epoch": 6.7, + "learning_rate": 2.1193092621664052e-06, + "loss": 0.468, + "step": 7930, + "task_loss": 0.7529047727584839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24795804917812347, + "epoch": 6.7, + "learning_rate": 2.113271344040575e-06, + "loss": 0.4143, + "step": 7931, + "task_loss": 0.13124839961528778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2998420298099518, + "epoch": 6.7, + "learning_rate": 2.1072334259147447e-06, + "loss": 0.4433, + "step": 7932, + "task_loss": 0.2126983106136322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27831244468688965, + "epoch": 6.71, + "learning_rate": 2.101195507788915e-06, + "loss": 0.361, + "step": 7933, + "task_loss": 0.8542847633361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3746930658817291, + "epoch": 6.71, + "learning_rate": 2.0951575896630846e-06, + "loss": 0.385, + "step": 7934, + "task_loss": 0.4437871277332306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5596070885658264, + "epoch": 6.71, + "learning_rate": 2.089119671537254e-06, + "loss": 0.4489, + "step": 7935, + "task_loss": 0.7102393507957458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3555496335029602, + "epoch": 6.71, + "learning_rate": 2.0830817534114236e-06, + "loss": 0.4421, + "step": 7936, + "task_loss": 0.76910400390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3050389587879181, + "epoch": 6.71, + "learning_rate": 2.0770438352855933e-06, + "loss": 0.3188, + "step": 7937, + "task_loss": 0.7956819534301758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24922674894332886, + "epoch": 6.71, + "learning_rate": 2.0710059171597635e-06, + "loss": 0.3186, + "step": 7938, + "task_loss": 0.176559180021286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4000927805900574, + "epoch": 6.71, + "learning_rate": 2.064967999033933e-06, + "loss": 0.3941, + "step": 7939, + "task_loss": 0.9128929972648621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5999336242675781, + "epoch": 6.71, + "learning_rate": 2.058930080908103e-06, + "loss": 0.5608, + "step": 7940, + "task_loss": 1.114283800125122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5452135801315308, + "epoch": 6.71, + "learning_rate": 2.0528921627822726e-06, + "loss": 0.4706, + "step": 7941, + "task_loss": 0.6370884776115417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3479295074939728, + "epoch": 6.71, + "learning_rate": 2.0468542446564424e-06, + "loss": 0.2823, + "step": 7942, + "task_loss": 0.42150014638900757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5642030239105225, + "epoch": 6.71, + "learning_rate": 2.040816326530612e-06, + "loss": 0.3913, + "step": 7943, + "task_loss": 0.7624131441116333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2225511074066162, + "epoch": 6.71, + "learning_rate": 2.0347784084047822e-06, + "loss": 0.3364, + "step": 7944, + "task_loss": 0.4228188693523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2807016372680664, + "epoch": 6.72, + "learning_rate": 2.028740490278952e-06, + "loss": 0.4071, + "step": 7945, + "task_loss": 0.8647692799568176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38694629073143005, + "epoch": 6.72, + "learning_rate": 2.0227025721531217e-06, + "loss": 0.3648, + "step": 7946, + "task_loss": 1.0911650657653809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1855526566505432, + "epoch": 6.72, + "learning_rate": 2.0166646540272914e-06, + "loss": 0.3738, + "step": 7947, + "task_loss": 0.3385108411312103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5334384441375732, + "epoch": 6.72, + "learning_rate": 2.010626735901461e-06, + "loss": 0.4576, + "step": 7948, + "task_loss": 1.1585549116134644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22523944079875946, + "epoch": 6.72, + "learning_rate": 2.0045888177756313e-06, + "loss": 0.2361, + "step": 7949, + "task_loss": 0.5896918773651123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48542869091033936, + "epoch": 6.72, + "learning_rate": 1.998550899649801e-06, + "loss": 0.4754, + "step": 7950, + "task_loss": 0.5891206860542297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5120091438293457, + "epoch": 6.72, + "learning_rate": 1.9925129815239707e-06, + "loss": 0.4674, + "step": 7951, + "task_loss": 0.7120363712310791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7795814275741577, + "epoch": 6.72, + "learning_rate": 1.9864750633981404e-06, + "loss": 0.4468, + "step": 7952, + "task_loss": 0.713590145111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3642672598361969, + "epoch": 6.72, + "learning_rate": 1.98043714527231e-06, + "loss": 0.467, + "step": 7953, + "task_loss": 0.0375012643635273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2871781289577484, + "epoch": 6.72, + "learning_rate": 1.9743992271464803e-06, + "loss": 0.3489, + "step": 7954, + "task_loss": 0.506109356880188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37487345933914185, + "epoch": 6.72, + "learning_rate": 1.96836130902065e-06, + "loss": 0.4275, + "step": 7955, + "task_loss": 0.8948546648025513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3117806017398834, + "epoch": 6.72, + "learning_rate": 1.9623233908948198e-06, + "loss": 0.4144, + "step": 7956, + "task_loss": 0.572005569934845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3016677498817444, + "epoch": 6.73, + "learning_rate": 1.9562854727689895e-06, + "loss": 0.4523, + "step": 7957, + "task_loss": 0.22136709094047546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43804067373275757, + "epoch": 6.73, + "learning_rate": 1.9502475546431592e-06, + "loss": 0.3946, + "step": 7958, + "task_loss": 0.820746123790741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6419487595558167, + "epoch": 6.73, + "learning_rate": 1.944209636517329e-06, + "loss": 0.3734, + "step": 7959, + "task_loss": 0.8261976838111877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4066256880760193, + "epoch": 6.73, + "learning_rate": 1.9381717183914987e-06, + "loss": 0.3644, + "step": 7960, + "task_loss": 0.26412928104400635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40790605545043945, + "epoch": 6.73, + "learning_rate": 1.9321338002656684e-06, + "loss": 0.3824, + "step": 7961, + "task_loss": 0.31495723128318787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3960433304309845, + "epoch": 6.73, + "learning_rate": 1.926095882139838e-06, + "loss": 0.5451, + "step": 7962, + "task_loss": 0.10692081600427628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28375244140625, + "epoch": 6.73, + "learning_rate": 1.920057964014008e-06, + "loss": 0.2544, + "step": 7963, + "task_loss": 0.316921591758728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5003267526626587, + "epoch": 6.73, + "learning_rate": 1.9140200458881776e-06, + "loss": 0.3621, + "step": 7964, + "task_loss": 0.12720170617103577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.14777764678001404, + "epoch": 6.73, + "learning_rate": 1.9079821277623477e-06, + "loss": 0.3467, + "step": 7965, + "task_loss": 0.2674954831600189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2301875352859497, + "epoch": 6.73, + "learning_rate": 1.9019442096365174e-06, + "loss": 0.4033, + "step": 7966, + "task_loss": 0.3096529543399811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44599300622940063, + "epoch": 6.73, + "learning_rate": 1.8959062915106872e-06, + "loss": 0.3165, + "step": 7967, + "task_loss": 0.6506139636039734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39861592650413513, + "epoch": 6.73, + "learning_rate": 1.8898683733848569e-06, + "loss": 0.545, + "step": 7968, + "task_loss": 1.3597209453582764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43565884232521057, + "epoch": 6.74, + "learning_rate": 1.8838304552590266e-06, + "loss": 0.3441, + "step": 7969, + "task_loss": 0.23760254681110382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5121496915817261, + "epoch": 6.74, + "learning_rate": 1.8777925371331968e-06, + "loss": 0.5822, + "step": 7970, + "task_loss": 1.17240309715271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3862224519252777, + "epoch": 6.74, + "learning_rate": 1.8717546190073665e-06, + "loss": 0.4281, + "step": 7971, + "task_loss": 0.5243268013000488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29804879426956177, + "epoch": 6.74, + "learning_rate": 1.8657167008815362e-06, + "loss": 0.4381, + "step": 7972, + "task_loss": 0.389913946390152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40790337324142456, + "epoch": 6.74, + "learning_rate": 1.859678782755706e-06, + "loss": 0.3972, + "step": 7973, + "task_loss": 0.9472525119781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4372307062149048, + "epoch": 6.74, + "learning_rate": 1.8536408646298756e-06, + "loss": 0.5798, + "step": 7974, + "task_loss": 0.7561575770378113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42009615898132324, + "epoch": 6.74, + "learning_rate": 1.8476029465040456e-06, + "loss": 0.5174, + "step": 7975, + "task_loss": 0.16192148625850677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30770349502563477, + "epoch": 6.74, + "learning_rate": 1.8415650283782153e-06, + "loss": 0.3667, + "step": 7976, + "task_loss": 0.5296329855918884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29661431908607483, + "epoch": 6.74, + "learning_rate": 1.835527110252385e-06, + "loss": 0.4037, + "step": 7977, + "task_loss": 0.2183021605014801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5738179683685303, + "epoch": 6.74, + "learning_rate": 1.8294891921265548e-06, + "loss": 0.4403, + "step": 7978, + "task_loss": 0.5755680799484253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3108544945716858, + "epoch": 6.74, + "learning_rate": 1.8234512740007245e-06, + "loss": 0.5109, + "step": 7979, + "task_loss": 1.4009648561477661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42050036787986755, + "epoch": 6.75, + "learning_rate": 1.8174133558748946e-06, + "loss": 0.4314, + "step": 7980, + "task_loss": 0.6934286952018738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36310625076293945, + "epoch": 6.75, + "learning_rate": 1.8113754377490644e-06, + "loss": 0.4216, + "step": 7981, + "task_loss": 0.49055513739585876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4886122941970825, + "epoch": 6.75, + "learning_rate": 1.805337519623234e-06, + "loss": 0.4387, + "step": 7982, + "task_loss": 0.7535020112991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.35759294033050537, + "epoch": 6.75, + "learning_rate": 1.7992996014974038e-06, + "loss": 0.5487, + "step": 7983, + "task_loss": 0.9826281666755676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33584505319595337, + "epoch": 6.75, + "learning_rate": 1.7932616833715735e-06, + "loss": 0.4077, + "step": 7984, + "task_loss": 0.16873866319656372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2501624524593353, + "epoch": 6.75, + "learning_rate": 1.7872237652457432e-06, + "loss": 0.4212, + "step": 7985, + "task_loss": 0.3186874985694885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2867826819419861, + "epoch": 6.75, + "learning_rate": 1.7811858471199132e-06, + "loss": 0.3232, + "step": 7986, + "task_loss": 0.4423218071460724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18785551190376282, + "epoch": 6.75, + "learning_rate": 1.775147928994083e-06, + "loss": 0.4489, + "step": 7987, + "task_loss": 0.4902009665966034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5768045783042908, + "epoch": 6.75, + "learning_rate": 1.7691100108682526e-06, + "loss": 0.4623, + "step": 7988, + "task_loss": 0.5507065653800964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3763067126274109, + "epoch": 6.75, + "learning_rate": 1.7630720927424224e-06, + "loss": 0.4211, + "step": 7989, + "task_loss": 0.5409693121910095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44266730546951294, + "epoch": 6.75, + "learning_rate": 1.757034174616592e-06, + "loss": 0.4566, + "step": 7990, + "task_loss": 0.7320343852043152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4275152385234833, + "epoch": 6.75, + "learning_rate": 1.7509962564907622e-06, + "loss": 0.4532, + "step": 7991, + "task_loss": 0.37711501121520996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4945535361766815, + "epoch": 6.76, + "learning_rate": 1.744958338364932e-06, + "loss": 0.3783, + "step": 7992, + "task_loss": 0.5126373171806335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31805723905563354, + "epoch": 6.76, + "learning_rate": 1.7389204202391017e-06, + "loss": 0.3649, + "step": 7993, + "task_loss": 0.20909534394741058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1315464973449707, + "epoch": 6.76, + "learning_rate": 1.7328825021132714e-06, + "loss": 0.3118, + "step": 7994, + "task_loss": 0.01028872188180685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5303727388381958, + "epoch": 6.76, + "learning_rate": 1.7268445839874411e-06, + "loss": 0.3717, + "step": 7995, + "task_loss": 1.2639329433441162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4363662004470825, + "epoch": 6.76, + "learning_rate": 1.720806665861611e-06, + "loss": 0.3448, + "step": 7996, + "task_loss": 1.200205683708191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4677465558052063, + "epoch": 6.76, + "learning_rate": 1.7147687477357808e-06, + "loss": 0.4801, + "step": 7997, + "task_loss": 1.2612807750701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6155959367752075, + "epoch": 6.76, + "learning_rate": 1.7087308296099505e-06, + "loss": 0.3818, + "step": 7998, + "task_loss": 0.9973390698432922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6081398725509644, + "epoch": 6.76, + "learning_rate": 1.7026929114841202e-06, + "loss": 0.4829, + "step": 7999, + "task_loss": 0.4856477975845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3179943561553955, + "epoch": 6.76, + "learning_rate": 1.69665499335829e-06, + "loss": 0.452, + "step": 8000, + "task_loss": 0.6029680967330933 + }, + { + "epoch": 6.76, + "eval_accuracy": 0.9126732673267327, + "eval_loss": 0.26727527379989624, + "eval_runtime": 226.8414, + "eval_samples_per_second": 111.311, + "eval_steps_per_second": 0.873, + "step": 8000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36752182245254517, + "epoch": 6.76, + "learning_rate": 1.69061707523246e-06, + "loss": 0.5529, + "step": 8001, + "task_loss": 0.12076438218355179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37279212474823, + "epoch": 6.76, + "learning_rate": 1.6845791571066298e-06, + "loss": 0.3141, + "step": 8002, + "task_loss": 0.38453209400177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7577190399169922, + "epoch": 6.76, + "learning_rate": 1.6785412389807996e-06, + "loss": 0.5044, + "step": 8003, + "task_loss": 0.9730449318885803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2961850166320801, + "epoch": 6.77, + "learning_rate": 1.6725033208549693e-06, + "loss": 0.3412, + "step": 8004, + "task_loss": 0.5795434713363647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5098868608474731, + "epoch": 6.77, + "learning_rate": 1.666465402729139e-06, + "loss": 0.4377, + "step": 8005, + "task_loss": 1.1716830730438232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4118422865867615, + "epoch": 6.77, + "learning_rate": 1.6604274846033087e-06, + "loss": 0.5184, + "step": 8006, + "task_loss": 1.0460752248764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21092408895492554, + "epoch": 6.77, + "learning_rate": 1.6543895664774787e-06, + "loss": 0.3966, + "step": 8007, + "task_loss": 0.5367329716682434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4523795247077942, + "epoch": 6.77, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.3369, + "step": 8008, + "task_loss": 0.43254226446151733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3948954939842224, + "epoch": 6.77, + "learning_rate": 1.6423137302258181e-06, + "loss": 0.4104, + "step": 8009, + "task_loss": 0.3960741460323334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.30564191937446594, + "epoch": 6.77, + "learning_rate": 1.6362758120999878e-06, + "loss": 0.4818, + "step": 8010, + "task_loss": 0.3475039601325989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28640469908714294, + "epoch": 6.77, + "learning_rate": 1.6302378939741576e-06, + "loss": 0.4139, + "step": 8011, + "task_loss": 0.6030334830284119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45724889636039734, + "epoch": 6.77, + "learning_rate": 1.6241999758483277e-06, + "loss": 0.4217, + "step": 8012, + "task_loss": 1.0668278932571411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3608822822570801, + "epoch": 6.77, + "learning_rate": 1.6181620577224974e-06, + "loss": 0.6165, + "step": 8013, + "task_loss": 0.6803489923477173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4502766728401184, + "epoch": 6.77, + "learning_rate": 1.6121241395966672e-06, + "loss": 0.3513, + "step": 8014, + "task_loss": 0.8112562894821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23649826645851135, + "epoch": 6.77, + "learning_rate": 1.6060862214708369e-06, + "loss": 0.4597, + "step": 8015, + "task_loss": 0.3836895525455475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.9106251001358032, + "epoch": 6.78, + "learning_rate": 1.6000483033450066e-06, + "loss": 0.4863, + "step": 8016, + "task_loss": 0.8019269704818726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5103771090507507, + "epoch": 6.78, + "learning_rate": 1.5940103852191767e-06, + "loss": 0.5116, + "step": 8017, + "task_loss": 0.42284879088401794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3947240114212036, + "epoch": 6.78, + "learning_rate": 1.5879724670933463e-06, + "loss": 0.383, + "step": 8018, + "task_loss": 1.3024839162826538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6466600298881531, + "epoch": 6.78, + "learning_rate": 1.581934548967516e-06, + "loss": 0.4831, + "step": 8019, + "task_loss": 0.44638004899024963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4026990830898285, + "epoch": 6.78, + "learning_rate": 1.5758966308416857e-06, + "loss": 0.5114, + "step": 8020, + "task_loss": 0.7961933612823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5287365913391113, + "epoch": 6.78, + "learning_rate": 1.5698587127158554e-06, + "loss": 0.5876, + "step": 8021, + "task_loss": 0.8645148277282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4717355966567993, + "epoch": 6.78, + "learning_rate": 1.5638207945900256e-06, + "loss": 0.5019, + "step": 8022, + "task_loss": 0.435039222240448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2992399334907532, + "epoch": 6.78, + "learning_rate": 1.557782876464195e-06, + "loss": 0.3725, + "step": 8023, + "task_loss": 0.14062321186065674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2429455816745758, + "epoch": 6.78, + "learning_rate": 1.551744958338365e-06, + "loss": 0.2931, + "step": 8024, + "task_loss": 0.5170424580574036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32316386699676514, + "epoch": 6.78, + "learning_rate": 1.5457070402125348e-06, + "loss": 0.4101, + "step": 8025, + "task_loss": 0.7490684390068054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31373485922813416, + "epoch": 6.78, + "learning_rate": 1.5396691220867047e-06, + "loss": 0.3198, + "step": 8026, + "task_loss": 0.4116826057434082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5458649396896362, + "epoch": 6.78, + "learning_rate": 1.5336312039608744e-06, + "loss": 0.4077, + "step": 8027, + "task_loss": 0.09386246651411057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29614460468292236, + "epoch": 6.79, + "learning_rate": 1.5275932858350441e-06, + "loss": 0.4112, + "step": 8028, + "task_loss": 0.5199512839317322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38036584854125977, + "epoch": 6.79, + "learning_rate": 1.521555367709214e-06, + "loss": 0.3372, + "step": 8029, + "task_loss": 0.9518170356750488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24982276558876038, + "epoch": 6.79, + "learning_rate": 1.5155174495833838e-06, + "loss": 0.3111, + "step": 8030, + "task_loss": 0.31936317682266235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38876450061798096, + "epoch": 6.79, + "learning_rate": 1.5094795314575535e-06, + "loss": 0.4135, + "step": 8031, + "task_loss": 0.4519073963165283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22202223539352417, + "epoch": 6.79, + "learning_rate": 1.5034416133317232e-06, + "loss": 0.3854, + "step": 8032, + "task_loss": 0.7422440648078918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27680790424346924, + "epoch": 6.79, + "learning_rate": 1.497403695205893e-06, + "loss": 0.4041, + "step": 8033, + "task_loss": 0.40240928530693054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2869625389575958, + "epoch": 6.79, + "learning_rate": 1.491365777080063e-06, + "loss": 0.3567, + "step": 8034, + "task_loss": 0.11391927301883698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22488252818584442, + "epoch": 6.79, + "learning_rate": 1.4853278589542326e-06, + "loss": 0.3181, + "step": 8035, + "task_loss": 0.6639074087142944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5034712553024292, + "epoch": 6.79, + "learning_rate": 1.4792899408284024e-06, + "loss": 0.5009, + "step": 8036, + "task_loss": 0.9051762819290161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4290744662284851, + "epoch": 6.79, + "learning_rate": 1.4732520227025723e-06, + "loss": 0.4564, + "step": 8037, + "task_loss": 0.4157385230064392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7050086259841919, + "epoch": 6.79, + "learning_rate": 1.467214104576742e-06, + "loss": 0.6109, + "step": 8038, + "task_loss": 0.4410955607891083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3011908531188965, + "epoch": 6.79, + "learning_rate": 1.461176186450912e-06, + "loss": 0.4057, + "step": 8039, + "task_loss": 0.934240996837616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4592078924179077, + "epoch": 6.8, + "learning_rate": 1.4551382683250817e-06, + "loss": 0.4102, + "step": 8040, + "task_loss": 0.4647703468799591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3786677122116089, + "epoch": 6.8, + "learning_rate": 1.4491003501992514e-06, + "loss": 0.4414, + "step": 8041, + "task_loss": 0.3575001060962677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2954862117767334, + "epoch": 6.8, + "learning_rate": 1.4430624320734211e-06, + "loss": 0.4506, + "step": 8042, + "task_loss": 0.46702656149864197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26899415254592896, + "epoch": 6.8, + "learning_rate": 1.4370245139475908e-06, + "loss": 0.3475, + "step": 8043, + "task_loss": 0.09749466925859451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3965744376182556, + "epoch": 6.8, + "learning_rate": 1.4309865958217608e-06, + "loss": 0.3893, + "step": 8044, + "task_loss": 0.43775674700737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31000733375549316, + "epoch": 6.8, + "learning_rate": 1.4249486776959305e-06, + "loss": 0.342, + "step": 8045, + "task_loss": 0.5138819813728333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5599683523178101, + "epoch": 6.8, + "learning_rate": 1.4189107595701002e-06, + "loss": 0.4317, + "step": 8046, + "task_loss": 0.7945274710655212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3637505769729614, + "epoch": 6.8, + "learning_rate": 1.4128728414442702e-06, + "loss": 0.3347, + "step": 8047, + "task_loss": 0.16970522701740265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4931928515434265, + "epoch": 6.8, + "learning_rate": 1.4068349233184399e-06, + "loss": 0.4541, + "step": 8048, + "task_loss": 0.2994762659072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3583560585975647, + "epoch": 6.8, + "learning_rate": 1.4007970051926096e-06, + "loss": 0.3454, + "step": 8049, + "task_loss": 0.16286805272102356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2832902669906616, + "epoch": 6.8, + "learning_rate": 1.3947590870667795e-06, + "loss": 0.3906, + "step": 8050, + "task_loss": 0.6985470056533813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3191336393356323, + "epoch": 6.81, + "learning_rate": 1.3887211689409493e-06, + "loss": 0.308, + "step": 8051, + "task_loss": 0.2285272777080536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420382618904114, + "epoch": 6.81, + "learning_rate": 1.382683250815119e-06, + "loss": 0.4478, + "step": 8052, + "task_loss": 0.9865618944168091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5874152183532715, + "epoch": 6.81, + "learning_rate": 1.3766453326892887e-06, + "loss": 0.3559, + "step": 8053, + "task_loss": 0.20289303362369537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23840884864330292, + "epoch": 6.81, + "learning_rate": 1.3706074145634584e-06, + "loss": 0.3416, + "step": 8054, + "task_loss": 0.36507004499435425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34686076641082764, + "epoch": 6.81, + "learning_rate": 1.3645694964376284e-06, + "loss": 0.3334, + "step": 8055, + "task_loss": 0.8852087259292603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23420071601867676, + "epoch": 6.81, + "learning_rate": 1.358531578311798e-06, + "loss": 0.3152, + "step": 8056, + "task_loss": 0.6890289187431335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5600628852844238, + "epoch": 6.81, + "learning_rate": 1.3524936601859678e-06, + "loss": 0.5092, + "step": 8057, + "task_loss": 1.6099399328231812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4027302861213684, + "epoch": 6.81, + "learning_rate": 1.3464557420601378e-06, + "loss": 0.3917, + "step": 8058, + "task_loss": 0.6434075236320496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3910123109817505, + "epoch": 6.81, + "learning_rate": 1.3404178239343075e-06, + "loss": 0.5619, + "step": 8059, + "task_loss": 0.2556496560573578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6062817573547363, + "epoch": 6.81, + "learning_rate": 1.3343799058084774e-06, + "loss": 0.5598, + "step": 8060, + "task_loss": 0.3176896870136261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31638848781585693, + "epoch": 6.81, + "learning_rate": 1.3283419876826471e-06, + "loss": 0.3816, + "step": 8061, + "task_loss": 0.24318234622478485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4265042543411255, + "epoch": 6.81, + "learning_rate": 1.3223040695568169e-06, + "loss": 0.4708, + "step": 8062, + "task_loss": 0.4662654995918274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37705564498901367, + "epoch": 6.82, + "learning_rate": 1.3162661514309866e-06, + "loss": 0.4495, + "step": 8063, + "task_loss": 0.5173128843307495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3848232328891754, + "epoch": 6.82, + "learning_rate": 1.3102282333051563e-06, + "loss": 0.325, + "step": 8064, + "task_loss": 0.4820360541343689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.516701340675354, + "epoch": 6.82, + "learning_rate": 1.3041903151793263e-06, + "loss": 0.4259, + "step": 8065, + "task_loss": 0.41348904371261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44096285104751587, + "epoch": 6.82, + "learning_rate": 1.298152397053496e-06, + "loss": 0.3127, + "step": 8066, + "task_loss": 0.22364304959774017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3346620202064514, + "epoch": 6.82, + "learning_rate": 1.2921144789276657e-06, + "loss": 0.464, + "step": 8067, + "task_loss": 0.9475773572921753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4296698570251465, + "epoch": 6.82, + "learning_rate": 1.2860765608018356e-06, + "loss": 0.4578, + "step": 8068, + "task_loss": 1.1858670711517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36888259649276733, + "epoch": 6.82, + "learning_rate": 1.2800386426760054e-06, + "loss": 0.4407, + "step": 8069, + "task_loss": 0.6773166060447693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4725002348423004, + "epoch": 6.82, + "learning_rate": 1.274000724550175e-06, + "loss": 0.5492, + "step": 8070, + "task_loss": 0.5123980045318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6548303365707397, + "epoch": 6.82, + "learning_rate": 1.267962806424345e-06, + "loss": 0.5271, + "step": 8071, + "task_loss": 0.7471229434013367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4654304087162018, + "epoch": 6.82, + "learning_rate": 1.2619248882985147e-06, + "loss": 0.3401, + "step": 8072, + "task_loss": 0.46453535556793213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43890371918678284, + "epoch": 6.82, + "learning_rate": 1.2558869701726847e-06, + "loss": 0.3421, + "step": 8073, + "task_loss": 1.2718652486801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4536915123462677, + "epoch": 6.82, + "learning_rate": 1.2498490520468544e-06, + "loss": 0.428, + "step": 8074, + "task_loss": 1.1045451164245605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.15460707247257233, + "epoch": 6.83, + "learning_rate": 1.2438111339210241e-06, + "loss": 0.3916, + "step": 8075, + "task_loss": 0.1299423724412918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3087708353996277, + "epoch": 6.83, + "learning_rate": 1.2377732157951939e-06, + "loss": 0.3365, + "step": 8076, + "task_loss": 0.21000158786773682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3338640630245209, + "epoch": 6.83, + "learning_rate": 1.2317352976693636e-06, + "loss": 0.3898, + "step": 8077, + "task_loss": 0.5585970282554626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.24771705269813538, + "epoch": 6.83, + "learning_rate": 1.2256973795435333e-06, + "loss": 0.3262, + "step": 8078, + "task_loss": 0.24024909734725952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29337120056152344, + "epoch": 6.83, + "learning_rate": 1.2196594614177032e-06, + "loss": 0.291, + "step": 8079, + "task_loss": 0.7006511092185974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4343748092651367, + "epoch": 6.83, + "learning_rate": 1.213621543291873e-06, + "loss": 0.424, + "step": 8080, + "task_loss": 1.0463024377822876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32036271691322327, + "epoch": 6.83, + "learning_rate": 1.207583625166043e-06, + "loss": 0.3982, + "step": 8081, + "task_loss": 0.6397793889045715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25133317708969116, + "epoch": 6.83, + "learning_rate": 1.2015457070402126e-06, + "loss": 0.4583, + "step": 8082, + "task_loss": 0.07132356613874435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3481224477291107, + "epoch": 6.83, + "learning_rate": 1.1955077889143823e-06, + "loss": 0.349, + "step": 8083, + "task_loss": 1.3154499530792236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3263162672519684, + "epoch": 6.83, + "learning_rate": 1.1894698707885523e-06, + "loss": 0.4042, + "step": 8084, + "task_loss": 0.8532785177230835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4737127125263214, + "epoch": 6.83, + "learning_rate": 1.183431952662722e-06, + "loss": 0.5398, + "step": 8085, + "task_loss": 0.5222520232200623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38231703639030457, + "epoch": 6.83, + "learning_rate": 1.1773940345368917e-06, + "loss": 0.3944, + "step": 8086, + "task_loss": 0.1516355574131012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5626338124275208, + "epoch": 6.84, + "learning_rate": 1.1713561164110615e-06, + "loss": 0.4744, + "step": 8087, + "task_loss": 0.3940061032772064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2515054941177368, + "epoch": 6.84, + "learning_rate": 1.1653181982852312e-06, + "loss": 0.3411, + "step": 8088, + "task_loss": 0.19618873298168182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5854541063308716, + "epoch": 6.84, + "learning_rate": 1.1592802801594011e-06, + "loss": 0.4218, + "step": 8089, + "task_loss": 0.4840226471424103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2881999909877777, + "epoch": 6.84, + "learning_rate": 1.1532423620335708e-06, + "loss": 0.3697, + "step": 8090, + "task_loss": 0.35623180866241455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4555911421775818, + "epoch": 6.84, + "learning_rate": 1.1472044439077406e-06, + "loss": 0.428, + "step": 8091, + "task_loss": 0.6215843558311462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3905564546585083, + "epoch": 6.84, + "learning_rate": 1.1411665257819105e-06, + "loss": 0.4456, + "step": 8092, + "task_loss": 1.0606939792633057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42606687545776367, + "epoch": 6.84, + "learning_rate": 1.1351286076560802e-06, + "loss": 0.4215, + "step": 8093, + "task_loss": 1.0449498891830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.16538885235786438, + "epoch": 6.84, + "learning_rate": 1.1290906895302502e-06, + "loss": 0.3765, + "step": 8094, + "task_loss": 0.1329883337020874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5121326446533203, + "epoch": 6.84, + "learning_rate": 1.1230527714044199e-06, + "loss": 0.3908, + "step": 8095, + "task_loss": 0.37612733244895935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40790823101997375, + "epoch": 6.84, + "learning_rate": 1.1170148532785896e-06, + "loss": 0.3418, + "step": 8096, + "task_loss": 0.6417520642280579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.31515976786613464, + "epoch": 6.84, + "learning_rate": 1.1109769351527593e-06, + "loss": 0.3775, + "step": 8097, + "task_loss": 0.4525148272514343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3336635231971741, + "epoch": 6.84, + "learning_rate": 1.104939017026929e-06, + "loss": 0.4608, + "step": 8098, + "task_loss": 0.6153085231781006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2598334848880768, + "epoch": 6.85, + "learning_rate": 1.098901098901099e-06, + "loss": 0.3469, + "step": 8099, + "task_loss": 0.24573233723640442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6087498068809509, + "epoch": 6.85, + "learning_rate": 1.0928631807752687e-06, + "loss": 0.5024, + "step": 8100, + "task_loss": 0.431217223405838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.331132173538208, + "epoch": 6.85, + "learning_rate": 1.0868252626494384e-06, + "loss": 0.4748, + "step": 8101, + "task_loss": 0.6218997836112976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34530937671661377, + "epoch": 6.85, + "learning_rate": 1.0807873445236084e-06, + "loss": 0.4111, + "step": 8102, + "task_loss": 0.14589296281337738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5008348822593689, + "epoch": 6.85, + "learning_rate": 1.074749426397778e-06, + "loss": 0.4975, + "step": 8103, + "task_loss": 0.5618547201156616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.276627779006958, + "epoch": 6.85, + "learning_rate": 1.0687115082719478e-06, + "loss": 0.3837, + "step": 8104, + "task_loss": 0.20287364721298218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3788642883300781, + "epoch": 6.85, + "learning_rate": 1.0626735901461178e-06, + "loss": 0.3701, + "step": 8105, + "task_loss": 0.4977553188800812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4012174904346466, + "epoch": 6.85, + "learning_rate": 1.0566356720202875e-06, + "loss": 0.3504, + "step": 8106, + "task_loss": 0.8576327562332153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3681253492832184, + "epoch": 6.85, + "learning_rate": 1.0505977538944574e-06, + "loss": 0.3502, + "step": 8107, + "task_loss": 2.7194690704345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.27520355582237244, + "epoch": 6.85, + "learning_rate": 1.044559835768627e-06, + "loss": 0.4021, + "step": 8108, + "task_loss": 0.5127708315849304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43381834030151367, + "epoch": 6.85, + "learning_rate": 1.0385219176427967e-06, + "loss": 0.4592, + "step": 8109, + "task_loss": 0.41394490003585815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34065431356430054, + "epoch": 6.85, + "learning_rate": 1.0324839995169666e-06, + "loss": 0.4351, + "step": 8110, + "task_loss": 0.9003871083259583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41222646832466125, + "epoch": 6.86, + "learning_rate": 1.0264460813911363e-06, + "loss": 0.3657, + "step": 8111, + "task_loss": 0.9168933033943176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.18176651000976562, + "epoch": 6.86, + "learning_rate": 1.020408163265306e-06, + "loss": 0.2647, + "step": 8112, + "task_loss": 0.2734471559524536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28752341866493225, + "epoch": 6.86, + "learning_rate": 1.014370245139476e-06, + "loss": 0.3991, + "step": 8113, + "task_loss": 0.5798923373222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.287160187959671, + "epoch": 6.86, + "learning_rate": 1.0083323270136457e-06, + "loss": 0.4789, + "step": 8114, + "task_loss": 0.6882603764533997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5609266757965088, + "epoch": 6.86, + "learning_rate": 1.0022944088878156e-06, + "loss": 0.4088, + "step": 8115, + "task_loss": 0.9643489122390747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39026910066604614, + "epoch": 6.86, + "learning_rate": 9.962564907619854e-07, + "loss": 0.4199, + "step": 8116, + "task_loss": 0.1604255735874176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3149331212043762, + "epoch": 6.86, + "learning_rate": 9.90218572636155e-07, + "loss": 0.3534, + "step": 8117, + "task_loss": 0.6909155249595642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.411400705575943, + "epoch": 6.86, + "learning_rate": 9.84180654510325e-07, + "loss": 0.2929, + "step": 8118, + "task_loss": 0.8703902959823608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43736398220062256, + "epoch": 6.86, + "learning_rate": 9.781427363844947e-07, + "loss": 0.4473, + "step": 8119, + "task_loss": 1.1147948503494263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32897982001304626, + "epoch": 6.86, + "learning_rate": 9.721048182586645e-07, + "loss": 0.5436, + "step": 8120, + "task_loss": 0.6354244947433472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6896844506263733, + "epoch": 6.86, + "learning_rate": 9.660669001328342e-07, + "loss": 0.4425, + "step": 8121, + "task_loss": 0.4681882858276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43088117241859436, + "epoch": 6.87, + "learning_rate": 9.60028982007004e-07, + "loss": 0.4394, + "step": 8122, + "task_loss": 0.5679709315299988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37356817722320557, + "epoch": 6.87, + "learning_rate": 9.539910638811739e-07, + "loss": 0.3776, + "step": 8123, + "task_loss": 0.4733153283596039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.48914074897766113, + "epoch": 6.87, + "learning_rate": 9.479531457553436e-07, + "loss": 0.439, + "step": 8124, + "task_loss": 0.8664729595184326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4317038655281067, + "epoch": 6.87, + "learning_rate": 9.419152276295133e-07, + "loss": 0.394, + "step": 8125, + "task_loss": 0.9643426537513733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33526235818862915, + "epoch": 6.87, + "learning_rate": 9.358773095036832e-07, + "loss": 0.409, + "step": 8126, + "task_loss": 0.9616798758506775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5133476257324219, + "epoch": 6.87, + "learning_rate": 9.29839391377853e-07, + "loss": 0.4956, + "step": 8127, + "task_loss": 0.8181787133216858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25340792536735535, + "epoch": 6.87, + "learning_rate": 9.238014732520228e-07, + "loss": 0.3446, + "step": 8128, + "task_loss": 0.19762665033340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4637390971183777, + "epoch": 6.87, + "learning_rate": 9.177635551261925e-07, + "loss": 0.4314, + "step": 8129, + "task_loss": 0.6076276302337646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.357305645942688, + "epoch": 6.87, + "learning_rate": 9.117256370003622e-07, + "loss": 0.3837, + "step": 8130, + "task_loss": 0.07775256037712097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2515905499458313, + "epoch": 6.87, + "learning_rate": 9.056877188745322e-07, + "loss": 0.3471, + "step": 8131, + "task_loss": 0.33892595767974854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.34571531414985657, + "epoch": 6.87, + "learning_rate": 8.996498007487019e-07, + "loss": 0.4797, + "step": 8132, + "task_loss": 0.7883678674697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5074376463890076, + "epoch": 6.87, + "learning_rate": 8.936118826228716e-07, + "loss": 0.4433, + "step": 8133, + "task_loss": 0.3056316077709198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3160848319530487, + "epoch": 6.88, + "learning_rate": 8.875739644970415e-07, + "loss": 0.4255, + "step": 8134, + "task_loss": 0.7935640215873718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45918622612953186, + "epoch": 6.88, + "learning_rate": 8.815360463712112e-07, + "loss": 0.4804, + "step": 8135, + "task_loss": 0.5965117812156677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4346823990345001, + "epoch": 6.88, + "learning_rate": 8.754981282453811e-07, + "loss": 0.3842, + "step": 8136, + "task_loss": 0.3502546548843384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6441003680229187, + "epoch": 6.88, + "learning_rate": 8.694602101195508e-07, + "loss": 0.4987, + "step": 8137, + "task_loss": 0.683269739151001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5230835676193237, + "epoch": 6.88, + "learning_rate": 8.634222919937206e-07, + "loss": 0.394, + "step": 8138, + "task_loss": 0.8928271532058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2345597892999649, + "epoch": 6.88, + "learning_rate": 8.573843738678904e-07, + "loss": 0.5523, + "step": 8139, + "task_loss": 0.22632645070552826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5438345074653625, + "epoch": 6.88, + "learning_rate": 8.513464557420601e-07, + "loss": 0.3649, + "step": 8140, + "task_loss": 0.5837914347648621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23033744096755981, + "epoch": 6.88, + "learning_rate": 8.4530853761623e-07, + "loss": 0.2821, + "step": 8141, + "task_loss": 0.10059036314487457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.285895437002182, + "epoch": 6.88, + "learning_rate": 8.392706194903998e-07, + "loss": 0.4645, + "step": 8142, + "task_loss": 0.40806180238723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4832836985588074, + "epoch": 6.88, + "learning_rate": 8.332327013645695e-07, + "loss": 0.5162, + "step": 8143, + "task_loss": 0.6327180862426758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2849859297275543, + "epoch": 6.88, + "learning_rate": 8.271947832387393e-07, + "loss": 0.3749, + "step": 8144, + "task_loss": 0.4231637418270111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37609267234802246, + "epoch": 6.88, + "learning_rate": 8.211568651129091e-07, + "loss": 0.3352, + "step": 8145, + "task_loss": 0.5230594277381897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3001956343650818, + "epoch": 6.89, + "learning_rate": 8.151189469870788e-07, + "loss": 0.393, + "step": 8146, + "task_loss": 0.5744699239730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46324974298477173, + "epoch": 6.89, + "learning_rate": 8.090810288612487e-07, + "loss": 0.3765, + "step": 8147, + "task_loss": 0.18940910696983337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25378215312957764, + "epoch": 6.89, + "learning_rate": 8.030431107354184e-07, + "loss": 0.4326, + "step": 8148, + "task_loss": 0.46648675203323364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42185258865356445, + "epoch": 6.89, + "learning_rate": 7.970051926095884e-07, + "loss": 0.5215, + "step": 8149, + "task_loss": 0.487506240606308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45335930585861206, + "epoch": 6.89, + "learning_rate": 7.90967274483758e-07, + "loss": 0.4531, + "step": 8150, + "task_loss": 0.5737056732177734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.505012035369873, + "epoch": 6.89, + "learning_rate": 7.849293563579277e-07, + "loss": 0.4273, + "step": 8151, + "task_loss": 1.5874508619308472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2771593928337097, + "epoch": 6.89, + "learning_rate": 7.788914382320975e-07, + "loss": 0.4523, + "step": 8152, + "task_loss": 0.39473310112953186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40684062242507935, + "epoch": 6.89, + "learning_rate": 7.728535201062674e-07, + "loss": 0.5218, + "step": 8153, + "task_loss": 0.23339396715164185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3783303201198578, + "epoch": 6.89, + "learning_rate": 7.668156019804372e-07, + "loss": 0.3988, + "step": 8154, + "task_loss": 0.7179688215255737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3663947880268097, + "epoch": 6.89, + "learning_rate": 7.60777683854607e-07, + "loss": 0.3217, + "step": 8155, + "task_loss": 0.32890433073043823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4665611982345581, + "epoch": 6.89, + "learning_rate": 7.547397657287768e-07, + "loss": 0.4, + "step": 8156, + "task_loss": 1.0287481546401978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4335521459579468, + "epoch": 6.89, + "learning_rate": 7.487018476029465e-07, + "loss": 0.4461, + "step": 8157, + "task_loss": 1.2561373710632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.49254167079925537, + "epoch": 6.9, + "learning_rate": 7.426639294771163e-07, + "loss": 0.4501, + "step": 8158, + "task_loss": 0.45394444465637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3290424048900604, + "epoch": 6.9, + "learning_rate": 7.366260113512861e-07, + "loss": 0.4876, + "step": 8159, + "task_loss": 0.36518895626068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21469900012016296, + "epoch": 6.9, + "learning_rate": 7.30588093225456e-07, + "loss": 0.4137, + "step": 8160, + "task_loss": 0.3176056742668152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.25622040033340454, + "epoch": 6.9, + "learning_rate": 7.245501750996257e-07, + "loss": 0.3501, + "step": 8161, + "task_loss": 0.22923578321933746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3183137774467468, + "epoch": 6.9, + "learning_rate": 7.185122569737954e-07, + "loss": 0.3315, + "step": 8162, + "task_loss": 0.4171198606491089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.213542640209198, + "epoch": 6.9, + "learning_rate": 7.124743388479653e-07, + "loss": 0.4382, + "step": 8163, + "task_loss": 1.1673282384872437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37414249777793884, + "epoch": 6.9, + "learning_rate": 7.064364207221351e-07, + "loss": 0.4184, + "step": 8164, + "task_loss": 0.7446993589401245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2703491449356079, + "epoch": 6.9, + "learning_rate": 7.003985025963048e-07, + "loss": 0.2869, + "step": 8165, + "task_loss": 0.09363465011119843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3255982995033264, + "epoch": 6.9, + "learning_rate": 6.943605844704746e-07, + "loss": 0.4884, + "step": 8166, + "task_loss": 1.2932459115982056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39657095074653625, + "epoch": 6.9, + "learning_rate": 6.883226663446444e-07, + "loss": 0.3838, + "step": 8167, + "task_loss": 0.41045093536376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7278048396110535, + "epoch": 6.9, + "learning_rate": 6.822847482188142e-07, + "loss": 0.4919, + "step": 8168, + "task_loss": 1.0628044605255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.612358808517456, + "epoch": 6.9, + "learning_rate": 6.762468300929839e-07, + "loss": 0.5065, + "step": 8169, + "task_loss": 0.7821404337882996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3966567814350128, + "epoch": 6.91, + "learning_rate": 6.702089119671537e-07, + "loss": 0.4038, + "step": 8170, + "task_loss": 0.31110134720802307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36263754963874817, + "epoch": 6.91, + "learning_rate": 6.641709938413236e-07, + "loss": 0.4968, + "step": 8171, + "task_loss": 0.5597882270812988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29303961992263794, + "epoch": 6.91, + "learning_rate": 6.581330757154933e-07, + "loss": 0.3814, + "step": 8172, + "task_loss": 0.6536696553230286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47450968623161316, + "epoch": 6.91, + "learning_rate": 6.520951575896631e-07, + "loss": 0.3956, + "step": 8173, + "task_loss": 1.1226543188095093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5717104077339172, + "epoch": 6.91, + "learning_rate": 6.460572394638329e-07, + "loss": 0.5377, + "step": 8174, + "task_loss": 1.3531861305236816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.67014479637146, + "epoch": 6.91, + "learning_rate": 6.400193213380027e-07, + "loss": 0.44, + "step": 8175, + "task_loss": 1.3451555967330933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6766231656074524, + "epoch": 6.91, + "learning_rate": 6.339814032121725e-07, + "loss": 0.4322, + "step": 8176, + "task_loss": 0.4496873915195465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4914029538631439, + "epoch": 6.91, + "learning_rate": 6.279434850863423e-07, + "loss": 0.4695, + "step": 8177, + "task_loss": 1.003716230392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.12936672568321228, + "epoch": 6.91, + "learning_rate": 6.219055669605121e-07, + "loss": 0.482, + "step": 8178, + "task_loss": 0.40614667534828186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1550191342830658, + "epoch": 6.91, + "learning_rate": 6.158676488346818e-07, + "loss": 0.474, + "step": 8179, + "task_loss": 0.10459624230861664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33134132623672485, + "epoch": 6.91, + "learning_rate": 6.098297307088516e-07, + "loss": 0.3437, + "step": 8180, + "task_loss": 0.6766206622123718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.510519802570343, + "epoch": 6.91, + "learning_rate": 6.037918125830215e-07, + "loss": 0.4822, + "step": 8181, + "task_loss": 0.8180686831474304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5164583325386047, + "epoch": 6.92, + "learning_rate": 5.977538944571912e-07, + "loss": 0.5062, + "step": 8182, + "task_loss": 1.2069807052612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.8733446598052979, + "epoch": 6.92, + "learning_rate": 5.91715976331361e-07, + "loss": 0.5766, + "step": 8183, + "task_loss": 1.321340799331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5231645107269287, + "epoch": 6.92, + "learning_rate": 5.856780582055307e-07, + "loss": 0.3844, + "step": 8184, + "task_loss": 1.2551548480987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37025097012519836, + "epoch": 6.92, + "learning_rate": 5.796401400797006e-07, + "loss": 0.3229, + "step": 8185, + "task_loss": 0.1682988703250885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.46883317828178406, + "epoch": 6.92, + "learning_rate": 5.736022219538703e-07, + "loss": 0.5129, + "step": 8186, + "task_loss": 1.0776530504226685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29069143533706665, + "epoch": 6.92, + "learning_rate": 5.675643038280401e-07, + "loss": 0.4101, + "step": 8187, + "task_loss": 0.7024058699607849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4763966202735901, + "epoch": 6.92, + "learning_rate": 5.615263857022099e-07, + "loss": 0.3963, + "step": 8188, + "task_loss": 0.6110609769821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44962620735168457, + "epoch": 6.92, + "learning_rate": 5.554884675763797e-07, + "loss": 0.4656, + "step": 8189, + "task_loss": 0.6429226994514465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.42734795808792114, + "epoch": 6.92, + "learning_rate": 5.494505494505495e-07, + "loss": 0.3554, + "step": 8190, + "task_loss": 0.6999492049217224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3357987701892853, + "epoch": 6.92, + "learning_rate": 5.434126313247192e-07, + "loss": 0.2665, + "step": 8191, + "task_loss": 0.29889100790023804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5168956518173218, + "epoch": 6.92, + "learning_rate": 5.37374713198889e-07, + "loss": 0.5186, + "step": 8192, + "task_loss": 0.4611911177635193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21987923979759216, + "epoch": 6.93, + "learning_rate": 5.313367950730589e-07, + "loss": 0.3691, + "step": 8193, + "task_loss": 0.5268322229385376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37954726815223694, + "epoch": 6.93, + "learning_rate": 5.252988769472287e-07, + "loss": 0.4221, + "step": 8194, + "task_loss": 0.1988370418548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6392261981964111, + "epoch": 6.93, + "learning_rate": 5.192609588213983e-07, + "loss": 0.399, + "step": 8195, + "task_loss": 0.3287290930747986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41659075021743774, + "epoch": 6.93, + "learning_rate": 5.132230406955682e-07, + "loss": 0.4101, + "step": 8196, + "task_loss": 0.6756597757339478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5132099986076355, + "epoch": 6.93, + "learning_rate": 5.07185122569738e-07, + "loss": 0.3624, + "step": 8197, + "task_loss": 0.7284449338912964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.367946982383728, + "epoch": 6.93, + "learning_rate": 5.011472044439078e-07, + "loss": 0.3967, + "step": 8198, + "task_loss": 0.4288441836833954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26173460483551025, + "epoch": 6.93, + "learning_rate": 4.951092863180775e-07, + "loss": 0.3173, + "step": 8199, + "task_loss": 1.3075904846191406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28854647278785706, + "epoch": 6.93, + "learning_rate": 4.890713681922474e-07, + "loss": 0.3417, + "step": 8200, + "task_loss": 0.1483665257692337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.376680850982666, + "epoch": 6.93, + "learning_rate": 4.830334500664171e-07, + "loss": 0.4937, + "step": 8201, + "task_loss": 0.997186541557312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.23964419960975647, + "epoch": 6.93, + "learning_rate": 4.769955319405869e-07, + "loss": 0.3447, + "step": 8202, + "task_loss": 1.1426169872283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43199658393859863, + "epoch": 6.93, + "learning_rate": 4.7095761381475665e-07, + "loss": 0.3444, + "step": 8203, + "task_loss": 0.79441899061203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17806360125541687, + "epoch": 6.93, + "learning_rate": 4.649196956889265e-07, + "loss": 0.2796, + "step": 8204, + "task_loss": 0.14915844798088074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2783088684082031, + "epoch": 6.94, + "learning_rate": 4.5888177756309626e-07, + "loss": 0.3704, + "step": 8205, + "task_loss": 0.7095043063163757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.19664883613586426, + "epoch": 6.94, + "learning_rate": 4.528438594372661e-07, + "loss": 0.3289, + "step": 8206, + "task_loss": 0.9144218564033508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44928330183029175, + "epoch": 6.94, + "learning_rate": 4.468059413114358e-07, + "loss": 0.3919, + "step": 8207, + "task_loss": 0.4711969494819641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20094677805900574, + "epoch": 6.94, + "learning_rate": 4.407680231856056e-07, + "loss": 0.3666, + "step": 8208, + "task_loss": 0.016046574339270592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43739840388298035, + "epoch": 6.94, + "learning_rate": 4.347301050597754e-07, + "loss": 0.4805, + "step": 8209, + "task_loss": 0.9702059030532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.40838590264320374, + "epoch": 6.94, + "learning_rate": 4.286921869339452e-07, + "loss": 0.4104, + "step": 8210, + "task_loss": 0.33884644508361816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6758958101272583, + "epoch": 6.94, + "learning_rate": 4.22654268808115e-07, + "loss": 0.4581, + "step": 8211, + "task_loss": 1.032557487487793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21364089846611023, + "epoch": 6.94, + "learning_rate": 4.1661635068228475e-07, + "loss": 0.3657, + "step": 8212, + "task_loss": 0.40058815479278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.298591673374176, + "epoch": 6.94, + "learning_rate": 4.1057843255645453e-07, + "loss": 0.4606, + "step": 8213, + "task_loss": 0.2349366694688797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3217827081680298, + "epoch": 6.94, + "learning_rate": 4.0454051443062436e-07, + "loss": 0.3279, + "step": 8214, + "task_loss": 0.20146390795707703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.424287885427475, + "epoch": 6.94, + "learning_rate": 3.985025963047942e-07, + "loss": 0.4528, + "step": 8215, + "task_loss": 1.092712163925171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41202759742736816, + "epoch": 6.94, + "learning_rate": 3.9246467817896386e-07, + "loss": 0.336, + "step": 8216, + "task_loss": 0.33420389890670776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4061731994152069, + "epoch": 6.95, + "learning_rate": 3.864267600531337e-07, + "loss": 0.4741, + "step": 8217, + "task_loss": 1.9225612878799438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33596500754356384, + "epoch": 6.95, + "learning_rate": 3.803888419273035e-07, + "loss": 0.4856, + "step": 8218, + "task_loss": 1.0238655805587769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3063194751739502, + "epoch": 6.95, + "learning_rate": 3.7435092380147324e-07, + "loss": 0.3719, + "step": 8219, + "task_loss": 0.2710898518562317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.43140900135040283, + "epoch": 6.95, + "learning_rate": 3.6831300567564307e-07, + "loss": 0.5023, + "step": 8220, + "task_loss": 0.6899304986000061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44983047246932983, + "epoch": 6.95, + "learning_rate": 3.6227508754981285e-07, + "loss": 0.4424, + "step": 8221, + "task_loss": 1.2916851043701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4073679447174072, + "epoch": 6.95, + "learning_rate": 3.562371694239826e-07, + "loss": 0.3701, + "step": 8222, + "task_loss": 1.1006274223327637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.47595617175102234, + "epoch": 6.95, + "learning_rate": 3.501992512981524e-07, + "loss": 0.4143, + "step": 8223, + "task_loss": 0.5812637209892273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28813672065734863, + "epoch": 6.95, + "learning_rate": 3.441613331723222e-07, + "loss": 0.3647, + "step": 8224, + "task_loss": 0.14159630239009857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.326826810836792, + "epoch": 6.95, + "learning_rate": 3.3812341504649196e-07, + "loss": 0.408, + "step": 8225, + "task_loss": 0.08592489361763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.21296627819538116, + "epoch": 6.95, + "learning_rate": 3.320854969206618e-07, + "loss": 0.3651, + "step": 8226, + "task_loss": 0.3518364429473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.44234588742256165, + "epoch": 6.95, + "learning_rate": 3.2604757879483156e-07, + "loss": 0.3826, + "step": 8227, + "task_loss": 0.8696706891059875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.13616123795509338, + "epoch": 6.95, + "learning_rate": 3.2000966066900134e-07, + "loss": 0.2091, + "step": 8228, + "task_loss": 0.2655141353607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26113781332969666, + "epoch": 6.96, + "learning_rate": 3.1397174254317117e-07, + "loss": 0.4005, + "step": 8229, + "task_loss": 0.12295400351285934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.38352537155151367, + "epoch": 6.96, + "learning_rate": 3.079338244173409e-07, + "loss": 0.4943, + "step": 8230, + "task_loss": 0.5141823291778564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3979910910129547, + "epoch": 6.96, + "learning_rate": 3.018959062915107e-07, + "loss": 0.379, + "step": 8231, + "task_loss": 1.265740156173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33287617564201355, + "epoch": 6.96, + "learning_rate": 2.958579881656805e-07, + "loss": 0.3613, + "step": 8232, + "task_loss": 0.671626627445221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3962387442588806, + "epoch": 6.96, + "learning_rate": 2.898200700398503e-07, + "loss": 0.3402, + "step": 8233, + "task_loss": 0.7090246677398682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5038058757781982, + "epoch": 6.96, + "learning_rate": 2.8378215191402006e-07, + "loss": 0.428, + "step": 8234, + "task_loss": 0.9491145014762878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5114168524742126, + "epoch": 6.96, + "learning_rate": 2.7774423378818983e-07, + "loss": 0.4507, + "step": 8235, + "task_loss": 0.7731209993362427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.13943323493003845, + "epoch": 6.96, + "learning_rate": 2.717063156623596e-07, + "loss": 0.33, + "step": 8236, + "task_loss": 0.5297982096672058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4558562636375427, + "epoch": 6.96, + "learning_rate": 2.6566839753652944e-07, + "loss": 0.3843, + "step": 8237, + "task_loss": 0.8994179368019104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5355358123779297, + "epoch": 6.96, + "learning_rate": 2.5963047941069916e-07, + "loss": 0.4074, + "step": 8238, + "task_loss": 0.723736584186554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4044015407562256, + "epoch": 6.96, + "learning_rate": 2.53592561284869e-07, + "loss": 0.3372, + "step": 8239, + "task_loss": 0.3826352655887604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3198971152305603, + "epoch": 6.96, + "learning_rate": 2.4755464315903877e-07, + "loss": 0.4066, + "step": 8240, + "task_loss": 0.4775959849357605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6953800320625305, + "epoch": 6.97, + "learning_rate": 2.4151672503320855e-07, + "loss": 0.4665, + "step": 8241, + "task_loss": 0.16682122647762299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17449912428855896, + "epoch": 6.97, + "learning_rate": 2.3547880690737833e-07, + "loss": 0.2643, + "step": 8242, + "task_loss": 0.1986398547887802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.37942612171173096, + "epoch": 6.97, + "learning_rate": 2.2944088878154813e-07, + "loss": 0.3919, + "step": 8243, + "task_loss": 0.474393755197525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.32233500480651855, + "epoch": 6.97, + "learning_rate": 2.234029706557179e-07, + "loss": 0.4255, + "step": 8244, + "task_loss": 0.49101269245147705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4243074357509613, + "epoch": 6.97, + "learning_rate": 2.173650525298877e-07, + "loss": 0.4017, + "step": 8245, + "task_loss": 0.6890111565589905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.29664698243141174, + "epoch": 6.97, + "learning_rate": 2.113271344040575e-07, + "loss": 0.4374, + "step": 8246, + "task_loss": 0.48036837577819824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.26391685009002686, + "epoch": 6.97, + "learning_rate": 2.0528921627822726e-07, + "loss": 0.3012, + "step": 8247, + "task_loss": 0.5241109728813171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3508797585964203, + "epoch": 6.97, + "learning_rate": 1.992512981523971e-07, + "loss": 0.4363, + "step": 8248, + "task_loss": 0.736172616481781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3754884600639343, + "epoch": 6.97, + "learning_rate": 1.9321338002656684e-07, + "loss": 0.3232, + "step": 8249, + "task_loss": 0.7228637933731079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7203053832054138, + "epoch": 6.97, + "learning_rate": 1.8717546190073662e-07, + "loss": 0.522, + "step": 8250, + "task_loss": 0.9883168935775757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3858627676963806, + "epoch": 6.97, + "learning_rate": 1.8113754377490642e-07, + "loss": 0.3983, + "step": 8251, + "task_loss": 0.3175954818725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.41841402649879456, + "epoch": 6.97, + "learning_rate": 1.750996256490762e-07, + "loss": 0.5062, + "step": 8252, + "task_loss": 1.2561511993408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33357691764831543, + "epoch": 6.98, + "learning_rate": 1.6906170752324598e-07, + "loss": 0.3777, + "step": 8253, + "task_loss": 0.8780134916305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.28846055269241333, + "epoch": 6.98, + "learning_rate": 1.6302378939741578e-07, + "loss": 0.5101, + "step": 8254, + "task_loss": 0.4271152913570404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4990662932395935, + "epoch": 6.98, + "learning_rate": 1.5698587127158559e-07, + "loss": 0.4595, + "step": 8255, + "task_loss": 0.9109876751899719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33004283905029297, + "epoch": 6.98, + "learning_rate": 1.5094795314575536e-07, + "loss": 0.3308, + "step": 8256, + "task_loss": 0.7639520764350891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.39072078466415405, + "epoch": 6.98, + "learning_rate": 1.4491003501992514e-07, + "loss": 0.3345, + "step": 8257, + "task_loss": 0.36566340923309326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3156832456588745, + "epoch": 6.98, + "learning_rate": 1.3887211689409492e-07, + "loss": 0.3101, + "step": 8258, + "task_loss": 0.8746612668037415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5381653308868408, + "epoch": 6.98, + "learning_rate": 1.3283419876826472e-07, + "loss": 0.5419, + "step": 8259, + "task_loss": 2.124971389770508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5853549242019653, + "epoch": 6.98, + "learning_rate": 1.267962806424345e-07, + "loss": 0.4934, + "step": 8260, + "task_loss": 0.6233627200126648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4489282965660095, + "epoch": 6.98, + "learning_rate": 1.2075836251660427e-07, + "loss": 0.4431, + "step": 8261, + "task_loss": 0.8354801535606384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5420633554458618, + "epoch": 6.98, + "learning_rate": 1.1472044439077406e-07, + "loss": 0.4513, + "step": 8262, + "task_loss": 0.8065614104270935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.36937177181243896, + "epoch": 6.98, + "learning_rate": 1.0868252626494385e-07, + "loss": 0.3463, + "step": 8263, + "task_loss": 0.12614281475543976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.6994350552558899, + "epoch": 6.99, + "learning_rate": 1.0264460813911363e-07, + "loss": 0.4711, + "step": 8264, + "task_loss": 0.7263739705085754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.20920465886592865, + "epoch": 6.99, + "learning_rate": 9.660669001328342e-08, + "loss": 0.4223, + "step": 8265, + "task_loss": 1.0424264669418335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.17900460958480835, + "epoch": 6.99, + "learning_rate": 9.056877188745321e-08, + "loss": 0.3473, + "step": 8266, + "task_loss": 0.41000494360923767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5167355537414551, + "epoch": 6.99, + "learning_rate": 8.453085376162299e-08, + "loss": 0.4375, + "step": 8267, + "task_loss": 1.25648033618927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.45862525701522827, + "epoch": 6.99, + "learning_rate": 7.849293563579279e-08, + "loss": 0.4071, + "step": 8268, + "task_loss": 1.5106650590896606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2271057665348053, + "epoch": 6.99, + "learning_rate": 7.245501750996257e-08, + "loss": 0.3145, + "step": 8269, + "task_loss": 0.04377155750989914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.7007173895835876, + "epoch": 6.99, + "learning_rate": 6.641709938413236e-08, + "loss": 0.5107, + "step": 8270, + "task_loss": 0.24144020676612854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4143049418926239, + "epoch": 6.99, + "learning_rate": 6.037918125830214e-08, + "loss": 0.4022, + "step": 8271, + "task_loss": 0.9066441059112549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.5543140172958374, + "epoch": 6.99, + "learning_rate": 5.434126313247193e-08, + "loss": 0.464, + "step": 8272, + "task_loss": 0.48787549138069153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.287723183631897, + "epoch": 6.99, + "learning_rate": 4.830334500664171e-08, + "loss": 0.3819, + "step": 8273, + "task_loss": 0.35248225927352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.284237265586853, + "epoch": 6.99, + "learning_rate": 4.2265426880811495e-08, + "loss": 0.4122, + "step": 8274, + "task_loss": 0.2473331242799759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.33322736620903015, + "epoch": 6.99, + "learning_rate": 3.6227508754981285e-08, + "loss": 0.3522, + "step": 8275, + "task_loss": 0.7891225814819336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.4246176779270172, + "epoch": 7.0, + "learning_rate": 3.018959062915107e-08, + "loss": 0.4606, + "step": 8276, + "task_loss": 0.9739570617675781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.22504520416259766, + "epoch": 7.0, + "learning_rate": 2.4151672503320856e-08, + "loss": 0.3732, + "step": 8277, + "task_loss": 1.023877501487732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.3508482277393341, + "epoch": 7.0, + "learning_rate": 1.8113754377490642e-08, + "loss": 0.3548, + "step": 8278, + "task_loss": 0.9861854314804077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.2988832890987396, + "epoch": 7.0, + "learning_rate": 1.2075836251660428e-08, + "loss": 0.3086, + "step": 8279, + "task_loss": 0.17360980808734894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.577366828918457, + "epoch": 7.0, + "learning_rate": 6.037918125830214e-09, + "loss": 0.4192, + "step": 8280, + "task_loss": 0.20891474187374115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.5, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.386257320484995, + "compression/movement_sparsity/model_sparsity": 0.3729881932598306, + "compression_loss": 0.0, + "distillation_loss": 0.1905464082956314, + "epoch": 7.0, + "learning_rate": 0.0, + "loss": 0.3376, + "step": 8281, + "task_loss": 0.2361963987350464 + }, + { + "epoch": 7.0, + "step": 8281, + "total_flos": 4.176434448946852e+19, + "train_loss": 18.350530295344083, + "train_runtime": 37482.3852, + "train_samples_per_second": 14.147, + "train_steps_per_second": 0.221 + } + ], + "max_steps": 8281, + "num_train_epochs": 7, + "total_flos": 4.176434448946852e+19, + "trial_name": null, + "trial_params": null +}